From e24606ea2c751434afd19752cb07c09df327b7e9 Mon Sep 17 00:00:00 2001 From: Stephen Seo Date: Sat, 23 Jan 2021 16:42:45 +0900 Subject: [PATCH] Impl use of OpenCL for generating blue noise --- Makefile | 5 +- src/blue_noise.cl | 25 +++ src/blue_noise.cpp | 446 ++++++++++++++++++++++++++++++++++++++++++--- src/blue_noise.hpp | 34 ++++ src/main.cpp | 2 +- 5 files changed, 486 insertions(+), 26 deletions(-) create mode 100644 src/blue_noise.cl diff --git a/Makefile b/Makefile index 449bec9..7eb9013 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,6 @@ -COMMON_FLAGS=-Wall -Wextra -Wpedantic -std=c++17 -lpthread +COMMON_FLAGS=-Wall -Wextra -Wpedantic -std=c++17 +LINKER_FLAGS=-lpthread -lOpenCL ifdef DEBUG CPPFLAGS=${COMMON_FLAGS} -g -O0 else @@ -14,7 +15,7 @@ OBJECTS=${subst .cpp,.o,${SOURCES}} all: Dithering Dithering: ${OBJECTS} - ${CXX} ${CPPFLAGS} -o Dithering $^ + ${CXX} ${CPPFLAGS} ${LINKER_FLAGS} -o Dithering $^ .PHONY: diff --git a/src/blue_noise.cl b/src/blue_noise.cl new file mode 100644 index 0000000..48a94b5 --- /dev/null +++ b/src/blue_noise.cl @@ -0,0 +1,25 @@ +__kernel void do_filter( + __global float *filter_out, __global float *precomputed, + __global int *pbp, int width, int height, int filter_size) { + int i = get_global_id(0); + if(i < 0 || i >= width * height) { + return; + } + int x = i % width; + int y = i / width; + + float sum = 0.0f; + for(int q = 0; q < filter_size; ++q) { + int q_prime = (height + filter_size / 2 + y - q) % height; + for(int p = 0; p < filter_size; ++p) { + int p_prime = (width + filter_size / 2 + x - p) % width; + if(pbp[p_prime + q_prime * width] != 0) { + sum += precomputed[p + q * filter_size]; + } + } + } + + filter_out[i] = sum; +} + +// vim: syntax=c diff --git a/src/blue_noise.cpp b/src/blue_noise.cpp index c5c1f32..1f0aea9 100644 --- a/src/blue_noise.cpp +++ b/src/blue_noise.cpp @@ -3,42 +3,110 @@ #include #include #include +#include #include +#include + #ifndef NDEBUG # include #endif std::vector dither::blue_noise(int width, int height, int threads) { - int count = width * height; - std::vector filter_out; - filter_out.resize(count); - std::vector pbp; // Prototype Binary Pattern - pbp.resize(count); + bool use_opencl = false; - std::default_random_engine re(std::random_device{}()); - std::uniform_int_distribution dist(0, count - 1); + // try to use OpenCL + do { + cl_device_id device; + cl_context context; + cl_program program; + cl_int err; - const int pixel_count = count * 4 / 10; + cl_platform_id platform; - // initialize pbp - for(int i = 0; i < count; ++i) { - if(i < pixel_count) { - pbp[i] = true; - } else { - pbp[i] = false; + int filter_size = (width + height) / 2; + + err = clGetPlatformIDs(1, &platform, nullptr); + if(err != CL_SUCCESS) { + std::cerr << "OpenCL: Failed to identify a platform\n"; + break; } + + err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, nullptr); + if(err != CL_SUCCESS) { + std::cerr << "OpenCL: Failed to get a device\n"; + break; + } + + context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &err); + + { + char buf[1024]; + std::ifstream program_file("src/blue_noise.cl"); + std::string program_string; + while(program_file.good()) { + program_file.read(buf, 1024); + if(int read_count = program_file.gcount(); read_count > 0) { + program_string.append(buf, read_count); + } + } + + const char *string_ptr = program_string.c_str(); + std::size_t program_size = program_string.size(); + program = clCreateProgramWithSource(context, 1, (const char**)&string_ptr, &program_size, &err); + if(err != CL_SUCCESS) { + std::cerr << "OpenCL: Failed to create the program\n"; + clReleaseContext(context); + break; + } + + err = clBuildProgram(program, 1, &device, nullptr, nullptr, nullptr); + if(err != CL_SUCCESS) { + std::cerr << "OpenCL: Failed to build the program\n"; + + std::size_t log_size; + clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, nullptr, &log_size); + std::unique_ptr log = std::make_unique(log_size + 1); + log[log_size] = 0; + clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, log.get(), nullptr); + std::cerr << log.get() << std::endl; + + clReleaseProgram(program); + clReleaseContext(context); + break; + } + } + + std::cout << "OpenCL: Initialized, trying cl_impl..." << std::endl; + std::vector result = internal::blue_noise_cl_impl( + width, height, filter_size, context, device, program); + + clReleaseProgram(program); + clReleaseContext(context); + + if(!result.empty()) { + return result; + } + } while (false); + + if(!use_opencl) { + std::cout << "OpenCL: Failed to setup/use, using regular impl..." << std::endl; + return internal::blue_noise_impl(width, height, threads); } - // randomize pbp - for(int i = 0; i < count-1; ++i) { - decltype(dist)::param_type range{i+1, count-1}; - int ridx = dist(re, range); - // probably can't use std::swap since using std::vector - bool temp = pbp[i]; - pbp[i] = pbp[ridx]; - pbp[ridx] = temp; - } + + return {}; +} + +std::vector dither::internal::blue_noise_impl(int width, int height, int threads) { + int count = width * height; + std::vector filter_out; + filter_out.resize(count); + + int pixel_count = count * 4 / 10; + std::vector pbp = random_noise(count, count * 4 / 10); + pbp.resize(count); + //#ifndef NDEBUG printf("Inserting %d pixels into image of max count %d\n", pixel_count, count); // generate image from randomized pbp @@ -167,3 +235,335 @@ std::vector dither::blue_noise(int width, int height, int threads) { return pbp; } + +std::vector dither::internal::blue_noise_cl_impl( + int width, int height, int filter_size, cl_context context, cl_device_id device, cl_program program) { + cl_int err; + cl_kernel kernel; + cl_command_queue queue; + cl_mem d_filter_out, d_precomputed, d_pbp; + std::size_t global_size, local_size; + + std::vector precomputed = precompute_gaussian(filter_size); + + int count = width * height; + int pixel_count = count * 4 / 10; + std::vector pbp = random_noise(count, pixel_count); + std::vector pbp_i(pbp.size()); + + queue = clCreateCommandQueueWithProperties(context, device, nullptr, &err); + + d_filter_out = clCreateBuffer(context, CL_MEM_WRITE_ONLY, count * sizeof(float), nullptr, nullptr); + d_precomputed = clCreateBuffer(context, CL_MEM_READ_ONLY, filter_size * filter_size * sizeof(float), nullptr, nullptr); + d_pbp = clCreateBuffer(context, CL_MEM_READ_ONLY, count * sizeof(int), nullptr, nullptr); + + err = clEnqueueWriteBuffer(queue, d_precomputed, CL_TRUE, 0, filter_size * filter_size * sizeof(float), &precomputed[0], 0, nullptr, nullptr); + if(err != CL_SUCCESS) { + std::cerr << "OpenCL: Failed to write to d_precomputed buffer\n"; + clReleaseMemObject(d_pbp); + clReleaseMemObject(d_precomputed); + clReleaseMemObject(d_filter_out); + clReleaseCommandQueue(queue); + return {}; + } + + /* + err = clEnqueueWriteBuffer(queue, d_pbp, CL_TRUE, 0, count * sizeof(int), &pbp_i[0], 0, nullptr, nullptr); + if(err != CL_SUCCESS) { + std::cerr << "OpenCL: Failed to write to d_pbp buffer\n"; + clReleaseMemObject(d_pbp); + clReleaseMemObject(d_precomputed); + clReleaseMemObject(d_filter_out); + clReleaseCommandQueue(queue); + return {}; + } + */ + + kernel = clCreateKernel(program, "do_filter", &err); + if(err != CL_SUCCESS) { + std::cerr << "OpenCL: Failed to create kernel: "; + switch(err) { + case CL_INVALID_PROGRAM: + std::cerr << "invalid program\n"; + break; + case CL_INVALID_PROGRAM_EXECUTABLE: + std::cerr << "invalid program executable\n"; + break; + case CL_INVALID_KERNEL_NAME: + std::cerr << "invalid kernel name\n"; + break; + case CL_INVALID_KERNEL_DEFINITION: + std::cerr << "invalid kernel definition\n"; + break; + case CL_INVALID_VALUE: + std::cerr << "invalid value\n"; + break; + case CL_OUT_OF_RESOURCES: + std::cerr << "out of resources\n"; + break; + case CL_OUT_OF_HOST_MEMORY: + std::cerr << "out of host memory\n"; + break; + default: + std::cerr << "unknown error\n"; + break; + } + clReleaseMemObject(d_pbp); + clReleaseMemObject(d_precomputed); + clReleaseMemObject(d_filter_out); + clReleaseCommandQueue(queue); + return {}; + } + + if(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_filter_out) != CL_SUCCESS) { + std::cerr << "OpenCL: Failed to set kernel arg 0\n"; + clReleaseKernel(kernel); + clReleaseMemObject(d_pbp); + clReleaseMemObject(d_precomputed); + clReleaseMemObject(d_filter_out); + clReleaseCommandQueue(queue); + return {}; + } + if(clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_precomputed) != CL_SUCCESS) { + std::cerr << "OpenCL: Failed to set kernel arg 1\n"; + clReleaseKernel(kernel); + clReleaseMemObject(d_pbp); + clReleaseMemObject(d_precomputed); + clReleaseMemObject(d_filter_out); + clReleaseCommandQueue(queue); + return {}; + } + if(clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_pbp) != CL_SUCCESS) { + std::cerr << "OpenCL: Failed to set kernel arg 2\n"; + clReleaseKernel(kernel); + clReleaseMemObject(d_pbp); + clReleaseMemObject(d_precomputed); + clReleaseMemObject(d_filter_out); + clReleaseCommandQueue(queue); + return {}; + } + if(clSetKernelArg(kernel, 3, sizeof(int), &width) != CL_SUCCESS) { + std::cerr << "OpenCL: Failed to set kernel arg 3\n"; + clReleaseKernel(kernel); + clReleaseMemObject(d_pbp); + clReleaseMemObject(d_precomputed); + clReleaseMemObject(d_filter_out); + clReleaseCommandQueue(queue); + return {}; + } + if(clSetKernelArg(kernel, 4, sizeof(int), &height) != CL_SUCCESS) { + std::cerr << "OpenCL: Failed to set kernel arg 4\n"; + clReleaseKernel(kernel); + clReleaseMemObject(d_pbp); + clReleaseMemObject(d_precomputed); + clReleaseMemObject(d_filter_out); + clReleaseCommandQueue(queue); + return {}; + } + if(clSetKernelArg(kernel, 5, sizeof(int), &filter_size) != CL_SUCCESS) { + std::cerr << "OpenCL: Failed to set kernel arg 4\n"; + clReleaseKernel(kernel); + clReleaseMemObject(d_pbp); + clReleaseMemObject(d_precomputed); + clReleaseMemObject(d_filter_out); + clReleaseCommandQueue(queue); + return {}; + } + + if(clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(std::size_t), &local_size, nullptr) != CL_SUCCESS) { + std::cerr << "OpenCL: Failed to get work group size\n"; + clReleaseKernel(kernel); + clReleaseMemObject(d_pbp); + clReleaseMemObject(d_precomputed); + clReleaseMemObject(d_filter_out); + clReleaseCommandQueue(queue); + return {}; + } + global_size = (std::size_t)std::ceil(count / (float)local_size) * local_size; + + std::cout << "OpenCL: global = " << global_size << ", local = " << local_size + << std::endl; + + std::vector filter(count); + + const auto get_filter = [&queue, &kernel, &global_size, &local_size, + &d_filter_out, &d_pbp, &pbp, &pbp_i, &count, &filter, &err] () -> bool { + for(unsigned int i = 0; i < pbp.size(); ++i) { + pbp_i[i] = pbp[i] ? 1 : 0; + } + if(clEnqueueWriteBuffer(queue, d_pbp, CL_TRUE, 0, count * sizeof(int), &pbp_i[0], 0, nullptr, nullptr) != CL_SUCCESS) { + std::cerr << "OpenCL: Failed to write to d_pbp buffer\n"; + return false; + } + + if(err = clEnqueueNDRangeKernel( + queue, kernel, 1, nullptr, &global_size, &local_size, + 0, nullptr, nullptr); err != CL_SUCCESS) { + std::cerr << "OpenCL: Failed to enqueue task: "; + switch(err) { + case CL_INVALID_PROGRAM_EXECUTABLE: + std::cerr << "invalid program executable\n"; + break; + case CL_INVALID_COMMAND_QUEUE: + std::cerr << "invalid command queue\n"; + break; + case CL_INVALID_KERNEL: + std::cerr << "invalid kernel\n"; + break; + case CL_INVALID_CONTEXT: + std::cerr << "invalid context\n"; + break; + case CL_INVALID_KERNEL_ARGS: + std::cerr << "invalid kernel args\n"; + break; + case CL_INVALID_WORK_DIMENSION: + std::cerr << "invalid work dimension\n"; + break; + case CL_INVALID_GLOBAL_WORK_SIZE: + std::cerr << "invalid global work size\n"; + break; + case CL_INVALID_GLOBAL_OFFSET: + std::cerr << "invalid global offset\n"; + break; + case CL_INVALID_WORK_GROUP_SIZE: + std::cerr << "invalid work group size\n"; + break; + case CL_INVALID_WORK_ITEM_SIZE: + std::cerr << "invalid work item size\n"; + break; + case CL_MISALIGNED_SUB_BUFFER_OFFSET: + std::cerr << "misaligned sub buffer offset\n"; + break; + default: + std::cerr << "Unknown\n"; + break; + } + return false; + } + + clFinish(queue); + + clEnqueueReadBuffer(queue, d_filter_out, CL_TRUE, 0, count * sizeof(float), &filter[0], 0, nullptr, nullptr); + + return true; + }; + + { + printf("Inserting %d pixels into image of max count %d\n", pixel_count, count); + // generate image from randomized pbp + FILE *random_noise_image = fopen("random_noise.pbm", "w"); + fprintf(random_noise_image, "P1\n%d %d\n", width, height); + for(int y = 0; y < height; ++y) { + for(int x = 0; x < width; ++x) { + fprintf(random_noise_image, "%d ", pbp[internal::twoToOne(x, y, width)] ? 1 : 0); + } + fputc('\n', random_noise_image); + } + fclose(random_noise_image); + } + + if(!get_filter()) { + std::cerr << "OpenCL: Failed to execute do_filter (at start)\n"; + clReleaseKernel(kernel); + clReleaseMemObject(d_pbp); + clReleaseMemObject(d_precomputed); + clReleaseMemObject(d_filter_out); + clReleaseCommandQueue(queue); + return {}; + } else { + internal::write_filter(filter, width, "filter_out_start.pgm"); + } + + int iterations = 0; + + while(true) { + printf("Iteration %d\n", ++iterations); + + if(!get_filter()) { + std::cerr << "OpenCL: Failed to execute do_filter\n"; + break; + } + + int min, max, min_zero, max_one; + std::tie(min, max) = internal::filter_minmax(filter); + if(!pbp[max]) { + max_one = internal::get_one_or_zero(pbp, true, max, width, height); + } else { + max_one = max; + } + if(!pbp[max_one]) { + std::cerr << "ERROR: Failed to find pbp[max] one" << std::endl; + break; + } + + if(pbp[min]) { + min_zero = internal::get_one_or_zero(pbp, false, min, width, height); + } else { + min_zero = min; + } + if(pbp[min_zero]) { + std::cerr << "ERROR: Failed to find pbp[min] zero" << std::endl; + break; + } + + pbp[max_one] = false; + + if(!get_filter()) { + std::cerr << "OpenCL: Failed to execute do_filter\n"; + break; + } + + // get second buffer's min + int second_min; + std::tie(second_min, std::ignore) = internal::filter_minmax(filter); + if(pbp[second_min]) { + second_min = internal::get_one_or_zero(pbp, false, second_min, width, height); + if(pbp[second_min]) { + std::cerr << "ERROR: Failed to find pbp[second_min] zero" << std::endl; + break; + } + } + + if(internal::dist(max_one, second_min, width) < 1.5f) { + pbp[max_one] = true; + break; + } else { + pbp[min_zero] = true; + } + + if(iterations % 100 == 0) { + // generate blue_noise image from pbp + FILE *blue_noise_image = fopen("blue_noise.pbm", "w"); + fprintf(blue_noise_image, "P1\n%d %d\n", width, height); + for(int y = 0; y < height; ++y) { + for(int x = 0; x < width; ++x) { + fprintf(blue_noise_image, "%d ", pbp[internal::twoToOne(x, y, width)] ? 1 : 0); + } + fputc('\n', blue_noise_image); + } + fclose(blue_noise_image); + } + } + + if(!get_filter()) { + std::cerr << "OpenCL: Failed to execute do_filter (at end)\n"; + } else { + internal::write_filter(filter, width, "filter_out_final.pgm"); + FILE *blue_noise_image = fopen("blue_noise.pbm", "w"); + fprintf(blue_noise_image, "P1\n%d %d\n", width, height); + for(int y = 0; y < height; ++y) { + for(int x = 0; x < width; ++x) { + fprintf(blue_noise_image, "%d ", pbp[internal::twoToOne(x, y, width)] ? 1 : 0); + } + fputc('\n', blue_noise_image); + } + fclose(blue_noise_image); + } + + clReleaseKernel(kernel); + clReleaseMemObject(d_pbp); + clReleaseMemObject(d_precomputed); + clReleaseMemObject(d_filter_out); + clReleaseCommandQueue(queue); + return pbp; +} diff --git a/src/blue_noise.hpp b/src/blue_noise.hpp index fff6e1b..41ca383 100644 --- a/src/blue_noise.hpp +++ b/src/blue_noise.hpp @@ -12,12 +12,46 @@ #include #include #include +#include + +#include namespace dither { std::vector blue_noise(int width, int height, int threads = 1); namespace internal { + std::vector blue_noise_impl(int width, int height, int threads = 1); + std::vector blue_noise_cl_impl( + int width, int height, int filter_size, + cl_context context, cl_device_id device, cl_program program); + + inline std::vector random_noise(int size, int subsize) { + std::vector pbp(size); + std::default_random_engine re(std::random_device{}()); + std::uniform_int_distribution dist(0, size - 1); + + // initialize pbp + for(int i = 0; i < size; ++i) { + if(i < subsize) { + pbp[i] = true; + } else { + pbp[i] = false; + } + } + // randomize pbp + for(int i = 0; i < size-1; ++i) { + decltype(dist)::param_type range{i+1, size-1}; + int ridx = dist(re, range); + // probably can't use std::swap since using std::vector + bool temp = pbp[i]; + pbp[i] = pbp[ridx]; + pbp[ridx] = temp; + } + + return pbp; + } + inline int twoToOne(int x, int y, int width) { return x + y * width; } diff --git a/src/main.cpp b/src/main.cpp index d86e9c7..51e17aa 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -5,7 +5,7 @@ int main(int argc, char **argv) { //#ifndef NDEBUG std::cout << "Trying blue_noise..." << std::endl; - dither::blue_noise(70, 70, 8); + dither::blue_noise(100, 100, 8); //#endif return 0; -- 2.49.0