Compare commits

..

10 commits

Author SHA1 Message Date
b88f3e9f4c Cleanup unused code
More cleanup of unused code.
2024-04-12 13:24:53 +09:00
4dbae98e8f Cleanup unused code
More cleanup of unused code.
2024-04-12 13:08:05 +09:00
e35a7e8720 Cleanup unused code
Remove unused code and do some cleanup.
2024-04-12 13:03:07 +09:00
997f72ca42 Impl. combine get_filter and minmax in Vk cmd buf
Includes fixes from bugs in previous commit.

Doesn't seem to improve the runtime much, so there may be more room for
optimization...
2024-04-11 13:38:06 +09:00
dfc78540db WIP combine Vulkan filter and min_max to cmd buf
This commit does some preparation for a new Vulkan compute shader
"blue_noise_filter.glsl". Note that every call of "filter" is followed
by a call to "min_max". The goal is to combine a single invocation of
Vulkan "filter" and log(n) invocations of Vulkan "min_max" in the same
command buffer, which may help with performance. This will be achieved
by passing the "max_in_buf" to the new "filter" compute shader, which
will hold the results of applying the precomputed-gaussian. This buffer
will then be copied to "min_in_buf", and then all is set to call the
Vulkan "min_max" compute shader log(n) times.

Note that log(n) comes from the fact that the Vulkan "min_max" compute
shader does a "reduce" on the input buffers where each SIMD invocation
compares two values and reduces them to one. Doing this approximately
log(n) (log base 2) times will gradually reduce the input into a single
minimum and a single maximum. This works due to having two separate
"layouts" for the same "min_max" shader where the "in" and "out" buffers
are swapped per "layout", so alternating between the layouts on each
call ensures that the proper buffers are reduced. (This work has already
been done. What's left is to combine the "filter" and "min_max" Vulkan
compute shaders into the same Vulkan command buffer. But first, the
actual setup for the new Vulkan "filter" compute shader still needs some
work.)
2024-04-10 17:43:24 +09:00
728d872af4 Vulkan compute: move buffer init to before fn call 2024-04-03 18:23:46 +09:00
2e6f414baf Vulkan compute: resize max/min out buffers
Change max/min out buffers to have same size as max/min in buffers.
2024-04-03 18:22:14 +09:00
97cfcddfb3 Vulkan compute: minor refactoring 2024-04-03 18:22:14 +09:00
06115a7a2d Vulkan compute: combine all minmax calls
This commit combines the minmax execution via Vulkan compute. The
previous implementation executed compute in vulkan_minmax with a new
command buffer each time. This implementation combines all required
executions of compute in vulkan_minmax in a single command buffer and
uses a pipeline to ensure the enqueued compute calls stay in order.
2024-04-03 18:22:13 +09:00
52e6a09abd Do "minmax" on Vulkan compute
Was an attempt to speed up blue-noise-generation with Vulkan compute,
but this implementation seems to slow it down instead.
2024-04-03 18:20:22 +09:00
4 changed files with 988 additions and 430 deletions

File diff suppressed because it is too large Load diff

View file

@ -13,7 +13,6 @@
#include <cmath> #include <cmath>
#include <condition_variable> #include <condition_variable>
#include <cstdio> #include <cstdio>
#include <functional>
#include <iostream> #include <iostream>
#include <limits> #include <limits>
#include <mutex> #include <mutex>
@ -22,7 +21,6 @@
#include <stdexcept> #include <stdexcept>
#include <thread> #include <thread>
#include <tuple> #include <tuple>
#include <unordered_set>
#include <vector> #include <vector>
#include "image.hpp" #include "image.hpp"
@ -48,6 +46,12 @@ struct QueueFamilyIndices {
QueueFamilyIndices vulkan_find_queue_families(VkPhysicalDevice device); QueueFamilyIndices vulkan_find_queue_families(VkPhysicalDevice device);
// Host-side record with the same name and fields as the `FloatAndIndex`
// struct declared in the GLSL compute shaders.
// NOTE(review): presumably the element type of the filter/min/max buffers
// shared with those shaders, so the field order and sizes would have to stay
// in sync with the GLSL declarations -- confirm against the buffer setup code.
struct FloatAndIndex {
// Value that the shaders accumulate and compare.
float value;
// Flag the shaders test against zero (`!= 0` / `== 0`).
int pbp;
// NOTE(review): presumably the element's original position -- confirm.
int idx;
};
std::optional<uint32_t> vulkan_find_memory_type(VkPhysicalDevice phys_dev, std::optional<uint32_t> vulkan_find_memory_type(VkPhysicalDevice phys_dev,
uint32_t t_filter, uint32_t t_filter,
VkMemoryPropertyFlags props); VkMemoryPropertyFlags props);
@ -59,7 +63,8 @@ bool vulkan_create_buffer(VkDevice device, VkPhysicalDevice phys_dev,
void vulkan_copy_buffer(VkDevice device, VkCommandPool command_pool, void vulkan_copy_buffer(VkDevice device, VkCommandPool command_pool,
VkQueue queue, VkBuffer src_buf, VkBuffer dst_buf, VkQueue queue, VkBuffer src_buf, VkBuffer dst_buf,
VkDeviceSize size, VkDeviceSize offset = 0); VkDeviceSize size, VkDeviceSize src_offset = 0,
VkDeviceSize dst_offset = 0);
void vulkan_copy_buffer_pieces( void vulkan_copy_buffer_pieces(
VkDevice device, VkCommandPool command_pool, VkQueue queue, VkDevice device, VkCommandPool command_pool, VkQueue queue,
VkBuffer src_buf, VkBuffer dst_buf, VkBuffer src_buf, VkBuffer dst_buf,
@ -74,111 +79,28 @@ void vulkan_invalidate_buffer(VkDevice device, VkDeviceMemory memory);
std::vector<unsigned int> blue_noise_vulkan_impl( std::vector<unsigned int> blue_noise_vulkan_impl(
VkDevice device, VkPhysicalDevice phys_device, VkDevice device, VkPhysicalDevice phys_device,
VkCommandBuffer command_buffer, VkCommandPool command_pool, VkQueue queue, VkCommandBuffer command_buffer, VkCommandPool command_pool, VkQueue queue,
VkBuffer pbp_buf, VkPipeline pipeline, VkPipelineLayout pipeline_layout, VkPipeline minmax_pipeline, VkPipelineLayout minmax_pipeline_layout,
VkDescriptorSet descriptor_set, VkBuffer filter_out_buf, const int width, std::array<VkDescriptorSet, 2> minmax_descriptor_sets, VkBuffer max_in_buf,
const int height); VkBuffer min_in_buf, VkBuffer max_out_buf, VkBuffer min_out_buf,
const int width, const int height, VkBuffer minmax_staging_buf,
VkDeviceMemory minmax_staging_buf_mem, void *minmax_mapped,
VkPipeline filter_in_out_pipeline,
VkPipelineLayout filter_in_out_pipeline_layout,
VkDescriptorSet filter_in_out_desc_set);
std::vector<float> vulkan_buf_to_vec(float *mapped, unsigned int size); std::vector<float> vulkan_buf_to_vec(float *mapped, unsigned int size);
inline bool vulkan_get_filter( std::optional<std::pair<int, int>> vulkan_filter_and_minmax(
VkDevice device, const VkDeviceSize phys_atom_size, VkDevice device, VkCommandBuffer command_buffer, VkCommandPool command_pool,
VkCommandBuffer command_buffer, VkCommandPool command_pool, VkQueue queue, VkQueue queue, VkPipeline filter_in_out_pipeline,
VkBuffer pbp_buf, VkPipeline pipeline, VkPipelineLayout pipeline_layout, VkPipelineLayout filter_in_out_pipeline_layout,
VkDescriptorSet descriptor_set, VkBuffer filter_out_buf, const int size, VkDescriptorSet filter_in_out_desc_set, VkPipeline minmax_pipeline,
std::vector<bool> &pbp, bool reversed_pbp, const std::size_t global_size, VkPipelineLayout minmax_pipeline_layout,
int *pbp_mapped_int, VkBuffer staging_pbp_buffer, std::array<VkDescriptorSet, 2> minmax_desc_sets, VkBuffer max_in_buf,
VkDeviceMemory staging_pbp_buffer_mem, VkBuffer min_in_buf, VkBuffer max_out_buf, VkBuffer min_out_buf,
VkDeviceMemory staging_filter_buffer_mem, VkBuffer staging_filter_buffer, const int size, const std::size_t global_size, std::vector<bool> &pbp,
std::vector<std::size_t> *changed) { VkBuffer staging_buf, VkDeviceMemory staging_buf_mem, void *staging_mapped,
vkResetCommandBuffer(command_buffer, 0); VkDeviceSize phys_atom_size, bool reversed_pbp);
if (changed != nullptr && changed->size() > 0) {
if (reversed_pbp) {
for (auto idx : *changed) {
pbp_mapped_int[idx] = pbp[idx] ? 0 : 1;
}
} else {
for (auto idx : *changed) {
pbp_mapped_int[idx] = pbp[idx] ? 1 : 0;
}
}
} else {
if (reversed_pbp) {
for (unsigned int i = 0; i < pbp.size(); ++i) {
pbp_mapped_int[i] = pbp[i] ? 0 : 1;
}
} else {
for (unsigned int i = 0; i < pbp.size(); ++i) {
pbp_mapped_int[i] = pbp[i] ? 1 : 0;
}
}
}
// Copy pbp buffer.
if (changed != nullptr && changed->size() > 0) {
std::vector<std::tuple<VkDeviceSize, VkDeviceSize> > pieces;
for (auto idx : *changed) {
pieces.emplace_back(std::make_tuple(sizeof(int), idx * sizeof(int)));
}
vulkan_flush_buffer_pieces(device, phys_atom_size, staging_pbp_buffer_mem,
pieces);
vulkan_copy_buffer_pieces(device, command_pool, queue, staging_pbp_buffer,
pbp_buf, pieces);
changed->clear();
} else {
vulkan_flush_buffer(device, staging_pbp_buffer_mem);
vulkan_copy_buffer(device, command_pool, queue, staging_pbp_buffer, pbp_buf,
size * sizeof(int));
}
VkCommandBufferBeginInfo begin_info{};
begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
if (vkBeginCommandBuffer(command_buffer, &begin_info) != VK_SUCCESS) {
std::clog << "get_filter ERROR: Failed to begin recording compute "
"command buffer!\n";
return false;
}
vkCmdBindPipeline(command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
vkCmdBindDescriptorSets(command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE,
pipeline_layout, 0, 1, &descriptor_set, 0, nullptr);
vkCmdDispatch(command_buffer, global_size, 1, 1);
if (vkEndCommandBuffer(command_buffer) != VK_SUCCESS) {
std::clog << "get_filter ERROR: Failed to record compute command buffer!\n";
return false;
}
{
VkSubmitInfo submit_info{};
submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
submit_info.commandBufferCount = 1;
submit_info.pCommandBuffers = &command_buffer;
submit_info.signalSemaphoreCount = 0;
submit_info.pSignalSemaphores = nullptr;
if (vkQueueSubmit(queue, 1, &submit_info, nullptr) != VK_SUCCESS) {
std::clog
<< "get_filter ERROR: Failed to submit compute command buffer!\n";
return false;
}
}
if (vkDeviceWaitIdle(device) != VK_SUCCESS) {
std::clog << "get_filter ERROR: Failed to vkDeviceWaitIdle!\n";
return false;
}
// Copy back filter_out buffer.
vulkan_copy_buffer(device, command_pool, queue, filter_out_buf,
staging_filter_buffer, size * sizeof(float));
vulkan_invalidate_buffer(device, staging_filter_buffer_mem);
return true;
}
#endif #endif

View file

@ -1,5 +1,11 @@
#version 450 #version 450
// Element type of the `filter_in_out` storage buffer used by this shader.
struct FloatAndIndex {
// Filter result for this element; main() resets it to 0 and then accumulates
// precomputed filter weights into it.
float value;
// Input flag: a neighbour contributes its filter weight only when its pbp is
// non-zero.
int pbp;
// Not written by the active code in this shader (the only assignment to it
// is commented out). NOTE(review): presumably the element's own index.
int idx;
};
int twoToOne(int x, int y, int width, int height) { int twoToOne(int x, int y, int width, int height) {
while (x < 0) { while (x < 0) {
x += width; x += width;
@ -14,11 +20,9 @@ int twoToOne(int x, int y, int width, int height) {
layout(binding = 0) readonly buffer PreComputed { float precomputed[]; }; layout(binding = 0) readonly buffer PreComputed { float precomputed[]; };
layout(binding = 1) writeonly buffer FilterOut { float filter_out[]; }; layout(binding = 1) buffer FilterInOut { FloatAndIndex filter_in_out[]; };
layout(binding = 2) readonly buffer PBP { int pbp[]; }; layout(binding = 2) readonly buffer Other {
layout(binding = 3) readonly buffer Other {
int width; int width;
int height; int height;
int filter_size; int filter_size;
@ -32,19 +36,20 @@ void main() {
return; return;
} }
// filter_in_out[index].idx = int(index);
int x = int(index % width); int x = int(index % width);
int y = int(index / width); int y = int(index / width);
float sum = 0.0F; filter_in_out[index].value = 0.0F;
for (int q = 0; q < filter_size; ++q) { for (int q = 0; q < filter_size; ++q) {
int q_prime = height - filter_size / 2 + y + q; int q_prime = height - filter_size / 2 + y + q;
for (int p = 0; p < filter_size; ++p) { for (int p = 0; p < filter_size; ++p) {
int p_prime = width - filter_size / 2 + x + p; int p_prime = width - filter_size / 2 + x + p;
if (pbp[twoToOne(p_prime, q_prime, width, height)] != 0) { if (filter_in_out[twoToOne(p_prime, q_prime, width, height)].pbp != 0) {
sum += precomputed[twoToOne(p, q, filter_size, filter_size)]; filter_in_out[index].value +=
precomputed[twoToOne(p, q, filter_size, filter_size)];
} }
} }
} }
filter_out[index] = sum;
} }

View file

@ -0,0 +1,54 @@
#version 450
// Element type of the four min/max storage buffers declared below.
struct FloatAndIndex {
// Value compared by the reduction in main().
float value;
// Candidate flag: main() prefers entries with pbp != 0 when picking a
// maximum and entries with pbp == 0 when picking a minimum.
int pbp;
// Never read or written individually by this shader; it is carried along
// when a whole struct is copied from an input to an output buffer.
// NOTE(review): presumably the element's original index -- confirm.
int idx;
};
// Inputs of one reduction pass: main() reads slots 2*i and 2*i+1 of each.
layout(binding = 0) readonly buffer MaxIn { FloatAndIndex max_in[]; };
layout(binding = 1) readonly buffer MinIn { FloatAndIndex min_in[]; };
// Outputs of one reduction pass: main() writes the selected entry to slot i.
layout(binding = 2) writeonly buffer MaxOut { FloatAndIndex max_out[]; };
layout(binding = 3) writeonly buffer MinOut { FloatAndIndex min_out[]; };
// `size` is the number of input entries main() considers in this pass; it is
// used for both the invocation bounds check and the odd-tail check.
layout(binding = 4) readonly buffer State { int size; };
// 256 invocations per work group along x; main() indexes by
// gl_GlobalInvocationID.x only.
layout(local_size_x = 256) in;
// One pairwise reduction pass over the min/max buffers.
//
// Invocation `index` inspects input slots 2*index and 2*index+1 and writes a
// single entry to slot `index` of each output buffer:
//   * max_out receives the entry with the larger value among those whose pbp
//     is non-zero; if only one of the two has a non-zero pbp, that one wins;
//     otherwise the second entry is copied.
//   * min_out receives the entry with the smaller value among those whose pbp
//     is zero; if only one of the two has a zero pbp, that one wins; otherwise
//     the second entry is copied.
// On ties (or unordered values) the second entry is chosen, because the
// comparisons are strict.
void main() {
  uint index = gl_GlobalInvocationID.x;
  if (index >= (size + 1) / 2) {
    // Only ceil(size / 2) invocations have a pair (or a lone tail) to handle.
    return;
  }

  uint first = index * 2;
  uint second = first + 1;

  if (second >= size) {
    // Odd tail: the last entry has no partner, so it is forwarded unchanged.
    // The slot right after it is flagged with the pbp value that is NOT a
    // candidate for the respective buffer (0 for max, 1 for min).
    // NOTE(review): presumably so that a later pass pairing against that
    // slot ignores it -- confirm against the dispatch loop.
    max_out[index] = max_in[first];
    max_out[index + 1].pbp = 0;
    min_out[index] = min_in[first];
    min_out[index + 1].pbp = 1;
    return;
  }

  // Maximum: candidates are the entries with a non-zero pbp.
  bool first_is_max_candidate = max_in[first].pbp != 0;
  bool second_is_max_candidate = max_in[second].pbp != 0;
  bool keep_first_for_max =
      first_is_max_candidate &&
      (!second_is_max_candidate || max_in[first].value > max_in[second].value);
  uint max_source = keep_first_for_max ? first : second;
  max_out[index] = max_in[max_source];

  // Minimum: candidates are the entries with a zero pbp.
  bool first_is_min_candidate = min_in[first].pbp == 0;
  bool second_is_min_candidate = min_in[second].pbp == 0;
  bool keep_first_for_min =
      first_is_min_candidate &&
      (!second_is_min_candidate || min_in[first].value < min_in[second].value);
  uint min_source = keep_first_for_min ? first : second;
  min_out[index] = min_in[min_source];
}