From ef7f623fb3af9f953c94d8ff8f8e327fa9d97166 Mon Sep 17 00:00:00 2001 From: Stephen Seo Date: Wed, 3 Apr 2024 17:55:37 +0900 Subject: [PATCH] Optimize buffer writes in vulkan_get_filter calls When only a single item in pbp buffer is changed, only update the single item in the staging buffer and the device buffer prior to Vulkan compute execution on buffers. --- src/blue_noise.cpp | 207 ++++++++++++++++++++++++++++++++++----------- src/blue_noise.hpp | 69 +++++++++++---- 2 files changed, 209 insertions(+), 67 deletions(-) diff --git a/src/blue_noise.cpp b/src/blue_noise.cpp index 85b99c7..df5a5cb 100644 --- a/src/blue_noise.cpp +++ b/src/blue_noise.cpp @@ -124,7 +124,8 @@ bool dither::internal::vulkan_create_buffer( void dither::internal::vulkan_copy_buffer(VkDevice device, VkCommandPool command_pool, VkQueue queue, VkBuffer src_buf, - VkBuffer dst_buf, VkDeviceSize size) { + VkBuffer dst_buf, VkDeviceSize size, + VkDeviceSize offset) { VkCommandBufferAllocateInfo alloc_info{}; alloc_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; alloc_info.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; @@ -142,6 +143,8 @@ void dither::internal::vulkan_copy_buffer(VkDevice device, VkBufferCopy copy_region{}; copy_region.size = size; + copy_region.srcOffset = offset; + copy_region.dstOffset = offset; vkCmdCopyBuffer(command_buf, src_buf, dst_buf, 1, ©_region); vkEndCommandBuffer(command_buf); @@ -157,6 +160,49 @@ void dither::internal::vulkan_copy_buffer(VkDevice device, vkFreeCommandBuffers(device, command_pool, 1, &command_buf); } +void dither::internal::vulkan_copy_buffer_pieces( + VkDevice device, VkCommandPool command_pool, VkQueue queue, + VkBuffer src_buf, VkBuffer dst_buf, + const std::vector> &pieces) { + VkCommandBufferAllocateInfo alloc_info{}; + alloc_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + alloc_info.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + alloc_info.commandPool = command_pool; + alloc_info.commandBufferCount = 1; + + VkCommandBuffer command_buf; + vkAllocateCommandBuffers(device, &alloc_info, &command_buf); + + VkCommandBufferBeginInfo begin_info{}; + begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + + vkBeginCommandBuffer(command_buf, &begin_info); + + std::vector regions; + for (auto tuple : pieces) { + VkBufferCopy copy_region{}; + copy_region.size = std::get<0>(tuple); + copy_region.srcOffset = std::get<1>(tuple); + copy_region.dstOffset = std::get<1>(tuple); + regions.push_back(copy_region); + } + vkCmdCopyBuffer(command_buf, src_buf, dst_buf, regions.size(), + regions.data()); + + vkEndCommandBuffer(command_buf); + + VkSubmitInfo submit_info{}; + submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submit_info.commandBufferCount = 1; + submit_info.pCommandBuffers = &command_buf; + + vkQueueSubmit(queue, 1, &submit_info, VK_NULL_HANDLE); + vkQueueWaitIdle(queue); + + vkFreeCommandBuffers(device, command_pool, 1, &command_buf); +} + void dither::internal::vulkan_flush_buffer(VkDevice device, VkDeviceMemory memory) { VkMappedMemoryRange range{}; @@ -171,6 +217,40 @@ void dither::internal::vulkan_flush_buffer(VkDevice device, } } +void dither::internal::vulkan_flush_buffer_pieces( + VkDevice device, const VkDeviceSize phys_atom_size, VkDeviceMemory memory, + const std::vector> &pieces) { + std::vector ranges; + for (auto tuple : pieces) { + VkMappedMemoryRange range{}; + range.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; + range.pNext = nullptr; + range.memory = memory; + range.offset = std::get<1>(tuple); + range.size = std::get<0>(tuple); + + // TODO dynamically handle multiple pieces for more efficient flushes. + // This may not be necessary if pieces is always size 1. + + if (range.offset % phys_atom_size != 0) { + range.offset = (range.offset / phys_atom_size) * phys_atom_size; + } + + if (range.size < phys_atom_size) { + range.size = phys_atom_size; + } else if (range.size % phys_atom_size != 0) { + range.size = (range.size / phys_atom_size) * phys_atom_size; + } + + ranges.push_back(range); + } + + if (vkFlushMappedMemoryRanges(device, ranges.size(), ranges.data()) != + VK_SUCCESS) { + std::clog << "WARNING: vulkan_flush_buffer failed!\n"; + } +} + void dither::internal::vulkan_invalidate_buffer(VkDevice device, VkDeviceMemory memory) { VkMappedMemoryRange range{}; @@ -258,6 +338,15 @@ std::vector dither::internal::blue_noise_vulkan_impl( &staging_filter_buffer_mem); float *filter_mapped_float = (float *)filter_mapped; + std::vector changed_indices; + + VkDeviceSize phys_atom_size; + { + VkPhysicalDeviceProperties props; + vkGetPhysicalDeviceProperties(phys_device, &props); + phys_atom_size = props.limits.nonCoherentAtomSize; + } + { #ifndef NDEBUG printf("Inserting %d pixels into image of max count %d\n", pixel_count, @@ -276,12 +365,12 @@ std::vector dither::internal::blue_noise_vulkan_impl( #endif } - if (!vulkan_get_filter(device, command_buffer, command_pool, queue, pbp_buf, - pipeline, pipeline_layout, descriptor_set, - filter_out_buf, size, pbp, reversed_pbp, global_size, - pbp_mapped_int, staging_pbp_buffer, - staging_pbp_buffer_mem, staging_filter_buffer_mem, - staging_filter_buffer)) { + if (!vulkan_get_filter( + device, phys_atom_size, command_buffer, command_pool, queue, pbp_buf, + pipeline, pipeline_layout, descriptor_set, filter_out_buf, size, pbp, + reversed_pbp, global_size, pbp_mapped_int, staging_pbp_buffer, + staging_pbp_buffer_mem, staging_filter_buffer_mem, + staging_filter_buffer, nullptr)) { std::cerr << "Vulkan: Failed to execute get_filter at start!\n"; } else { #ifndef NDEBUG @@ -300,12 +389,13 @@ std::vector dither::internal::blue_noise_vulkan_impl( printf("Iteration %d\n", ++iterations); #endif - if (!vulkan_get_filter(device, command_buffer, command_pool, queue, pbp_buf, - pipeline, pipeline_layout, descriptor_set, - filter_out_buf, size, pbp, reversed_pbp, global_size, - pbp_mapped_int, staging_pbp_buffer, - staging_pbp_buffer_mem, staging_filter_buffer_mem, - staging_filter_buffer)) { + if (!vulkan_get_filter(device, phys_atom_size, command_buffer, command_pool, + queue, pbp_buf, pipeline, pipeline_layout, + descriptor_set, filter_out_buf, size, pbp, + reversed_pbp, global_size, pbp_mapped_int, + staging_pbp_buffer, staging_pbp_buffer_mem, + staging_filter_buffer_mem, staging_filter_buffer, + &changed_indices)) { std::cerr << "Vulkan: Failed to execute do_filter\n"; break; } @@ -316,12 +406,15 @@ std::vector dither::internal::blue_noise_vulkan_impl( pbp[max] = false; - if (!vulkan_get_filter(device, command_buffer, command_pool, queue, pbp_buf, - pipeline, pipeline_layout, descriptor_set, - filter_out_buf, size, pbp, reversed_pbp, global_size, - pbp_mapped_int, staging_pbp_buffer, - staging_pbp_buffer_mem, staging_filter_buffer_mem, - staging_filter_buffer)) { + changed_indices.push_back(max); + + if (!vulkan_get_filter(device, phys_atom_size, command_buffer, command_pool, + queue, pbp_buf, pipeline, pipeline_layout, + descriptor_set, filter_out_buf, size, pbp, + reversed_pbp, global_size, pbp_mapped_int, + staging_pbp_buffer, staging_pbp_buffer_mem, + staging_filter_buffer_mem, staging_filter_buffer, + &changed_indices)) { std::cerr << "Vulkan: Failed to execute do_filter\n"; break; } @@ -333,9 +426,11 @@ std::vector dither::internal::blue_noise_vulkan_impl( if (second_min == max) { pbp[max] = true; + changed_indices.push_back(max); break; } else { pbp[second_min] = true; + changed_indices.push_back(second_min); } #ifndef NDEBUG @@ -357,12 +452,12 @@ std::vector dither::internal::blue_noise_vulkan_impl( #endif } - if (!vulkan_get_filter(device, command_buffer, command_pool, queue, pbp_buf, - pipeline, pipeline_layout, descriptor_set, - filter_out_buf, size, pbp, reversed_pbp, global_size, - pbp_mapped_int, staging_pbp_buffer, - staging_pbp_buffer_mem, staging_filter_buffer_mem, - staging_filter_buffer)) { + if (!vulkan_get_filter( + device, phys_atom_size, command_buffer, command_pool, queue, pbp_buf, + pipeline, pipeline_layout, descriptor_set, filter_out_buf, size, pbp, + reversed_pbp, global_size, pbp_mapped_int, staging_pbp_buffer, + staging_pbp_buffer_mem, staging_filter_buffer_mem, + staging_filter_buffer, &changed_indices)) { std::cerr << "Vulkan: Failed to execute do_filter (at end)\n"; } else { #ifndef NDEBUG @@ -401,16 +496,17 @@ std::vector dither::internal::blue_noise_vulkan_impl( #ifndef NDEBUG std::cout << i << ' '; #endif - vulkan_get_filter(device, command_buffer, command_pool, queue, pbp_buf, - pipeline, pipeline_layout, descriptor_set, - filter_out_buf, size, pbp, reversed_pbp, global_size, - pbp_mapped_int, staging_pbp_buffer, + vulkan_get_filter(device, phys_atom_size, command_buffer, command_pool, + queue, pbp_buf, pipeline, pipeline_layout, + descriptor_set, filter_out_buf, size, pbp, reversed_pbp, + global_size, pbp_mapped_int, staging_pbp_buffer, staging_pbp_buffer_mem, staging_filter_buffer_mem, - staging_filter_buffer); + staging_filter_buffer, &changed_indices); std::tie(std::ignore, max) = internal::filter_minmax_raw_array(filter_mapped_float, size, pbp); pbp.at(max) = false; dither_array.at(max) = i; + changed_indices.push_back(max); #ifndef NDEBUG if (set.find(max) != set.end()) { std::cout << "\nWARNING: Reusing index " << max << '\n'; @@ -430,15 +526,17 @@ std::vector dither::internal::blue_noise_vulkan_impl( #ifndef NDEBUG std::cout << i << ' '; #endif - vulkan_get_filter(device, command_buffer, command_pool, queue, pbp_buf, - pipeline, pipeline_layout, descriptor_set, filter_out_buf, - size, pbp, reversed_pbp, global_size, pbp_mapped_int, - staging_pbp_buffer, staging_pbp_buffer_mem, - staging_filter_buffer_mem, staging_filter_buffer); + vulkan_get_filter(device, phys_atom_size, command_buffer, command_pool, + queue, pbp_buf, pipeline, pipeline_layout, descriptor_set, + filter_out_buf, size, pbp, reversed_pbp, global_size, + pbp_mapped_int, staging_pbp_buffer, + staging_pbp_buffer_mem, staging_filter_buffer_mem, + staging_filter_buffer, &changed_indices); std::tie(min, std::ignore) = internal::filter_minmax_raw_array(filter_mapped_float, size, pbp); pbp.at(min) = true; dither_array.at(min) = i; + changed_indices.push_back(min); #ifndef NDEBUG if (set.find(min) != set.end()) { std::cout << "\nWARNING: Reusing index " << min << '\n'; @@ -451,11 +549,12 @@ std::vector dither::internal::blue_noise_vulkan_impl( { image::Bl min_pixels = internal::rangeToBl(dither_array, width); min_pixels.writeToFile(image::file_type::PNG, true, "da_mid_pixels.png"); - vulkan_get_filter(device, command_buffer, command_pool, queue, pbp_buf, - pipeline, pipeline_layout, descriptor_set, filter_out_buf, - size, pbp, reversed_pbp, global_size, pbp_mapped_int, - staging_pbp_buffer, staging_pbp_buffer_mem, - staging_filter_buffer_mem, staging_filter_buffer); + vulkan_get_filter(device, phys_atom_size, command_buffer, command_pool, + queue, pbp_buf, pipeline, pipeline_layout, descriptor_set, + filter_out_buf, size, pbp, reversed_pbp, global_size, + pbp_mapped_int, staging_pbp_buffer, + staging_pbp_buffer_mem, staging_filter_buffer_mem, + staging_filter_buffer, &changed_indices); internal::write_filter(vulkan_buf_to_vec(filter_mapped_float, size), width, "filter_mid.pgm"); image::Bl pbp_image = toBl(pbp, width); @@ -464,19 +563,26 @@ std::vector dither::internal::blue_noise_vulkan_impl( #endif std::cout << "\nRanking last half of pixels...\n"; reversed_pbp = true; + bool first_reversed_run = true; for (unsigned int i = (size + 1) / 2; i < (unsigned int)size; ++i) { #ifndef NDEBUG std::cout << i << ' '; #endif - vulkan_get_filter(device, command_buffer, command_pool, queue, pbp_buf, - pipeline, pipeline_layout, descriptor_set, filter_out_buf, - size, pbp, reversed_pbp, global_size, pbp_mapped_int, - staging_pbp_buffer, staging_pbp_buffer_mem, - staging_filter_buffer_mem, staging_filter_buffer); + if (first_reversed_run) { + changed_indices.clear(); + first_reversed_run = false; + } + vulkan_get_filter(device, phys_atom_size, command_buffer, command_pool, + queue, pbp_buf, pipeline, pipeline_layout, descriptor_set, + filter_out_buf, size, pbp, reversed_pbp, global_size, + pbp_mapped_int, staging_pbp_buffer, + staging_pbp_buffer_mem, staging_filter_buffer_mem, + staging_filter_buffer, &changed_indices); std::tie(std::ignore, max) = internal::filter_minmax_raw_array(filter_mapped_float, size, pbp); pbp.at(max) = true; dither_array.at(max) = i; + changed_indices.push_back(max); #ifndef NDEBUG if (set.find(max) != set.end()) { std::cout << "\nWARNING: Reusing index " << max << '\n'; @@ -489,11 +595,12 @@ std::vector dither::internal::blue_noise_vulkan_impl( #ifndef NDEBUG { - vulkan_get_filter(device, command_buffer, command_pool, queue, pbp_buf, - pipeline, pipeline_layout, descriptor_set, filter_out_buf, - size, pbp, reversed_pbp, global_size, pbp_mapped_int, - staging_pbp_buffer, staging_pbp_buffer_mem, - staging_filter_buffer_mem, staging_filter_buffer); + vulkan_get_filter(device, phys_atom_size, command_buffer, command_pool, + queue, pbp_buf, pipeline, pipeline_layout, descriptor_set, + filter_out_buf, size, pbp, reversed_pbp, global_size, + pbp_mapped_int, staging_pbp_buffer, + staging_pbp_buffer_mem, staging_filter_buffer_mem, + staging_filter_buffer, nullptr); internal::write_filter(vulkan_buf_to_vec(filter_mapped_float, size), width, "filter_after.pgm"); image::Bl pbp_image = toBl(pbp, width); diff --git a/src/blue_noise.hpp b/src/blue_noise.hpp index daea77a..01d1887 100644 --- a/src/blue_noise.hpp +++ b/src/blue_noise.hpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -58,9 +59,16 @@ bool vulkan_create_buffer(VkDevice device, VkPhysicalDevice phys_dev, void vulkan_copy_buffer(VkDevice device, VkCommandPool command_pool, VkQueue queue, VkBuffer src_buf, VkBuffer dst_buf, - VkDeviceSize size); + VkDeviceSize size, VkDeviceSize offset = 0); +void vulkan_copy_buffer_pieces( + VkDevice device, VkCommandPool command_pool, VkQueue queue, + VkBuffer src_buf, VkBuffer dst_buf, + const std::vector > &pieces); void vulkan_flush_buffer(VkDevice device, VkDeviceMemory memory); +void vulkan_flush_buffer_pieces( + VkDevice device, const VkDeviceSize phys_atom_size, VkDeviceMemory memory, + const std::vector > &pieces); void vulkan_invalidate_buffer(VkDevice device, VkDeviceMemory memory); std::vector blue_noise_vulkan_impl( @@ -73,30 +81,57 @@ std::vector blue_noise_vulkan_impl( std::vector vulkan_buf_to_vec(float *mapped, unsigned int size); inline bool vulkan_get_filter( - VkDevice device, VkCommandBuffer command_buffer, VkCommandPool command_pool, - VkQueue queue, VkBuffer pbp_buf, VkPipeline pipeline, - VkPipelineLayout pipeline_layout, VkDescriptorSet descriptor_set, - VkBuffer filter_out_buf, const int size, std::vector &pbp, - bool reversed_pbp, const std::size_t global_size, int *pbp_mapped_int, - VkBuffer staging_pbp_buffer, VkDeviceMemory staging_pbp_buffer_mem, - VkDeviceMemory staging_filter_buffer_mem, VkBuffer staging_filter_buffer) { + VkDevice device, const VkDeviceSize phys_atom_size, + VkCommandBuffer command_buffer, VkCommandPool command_pool, VkQueue queue, + VkBuffer pbp_buf, VkPipeline pipeline, VkPipelineLayout pipeline_layout, + VkDescriptorSet descriptor_set, VkBuffer filter_out_buf, const int size, + std::vector &pbp, bool reversed_pbp, const std::size_t global_size, + int *pbp_mapped_int, VkBuffer staging_pbp_buffer, + VkDeviceMemory staging_pbp_buffer_mem, + VkDeviceMemory staging_filter_buffer_mem, VkBuffer staging_filter_buffer, + std::vector *changed) { vkResetCommandBuffer(command_buffer, 0); - if (reversed_pbp) { - for (unsigned int i = 0; i < pbp.size(); ++i) { - pbp_mapped_int[i] = pbp[i] ? 0 : 1; + if (changed != nullptr && changed->size() > 0) { + if (reversed_pbp) { + for (auto idx : *changed) { + pbp_mapped_int[idx] = pbp[idx] ? 0 : 1; + } + } else { + for (auto idx : *changed) { + pbp_mapped_int[idx] = pbp[idx] ? 1 : 0; + } } } else { - for (unsigned int i = 0; i < pbp.size(); ++i) { - pbp_mapped_int[i] = pbp[i] ? 1 : 0; + if (reversed_pbp) { + for (unsigned int i = 0; i < pbp.size(); ++i) { + pbp_mapped_int[i] = pbp[i] ? 0 : 1; + } + } else { + for (unsigned int i = 0; i < pbp.size(); ++i) { + pbp_mapped_int[i] = pbp[i] ? 1 : 0; + } } } - vulkan_flush_buffer(device, staging_pbp_buffer_mem); - // Copy pbp buffer. - vulkan_copy_buffer(device, command_pool, queue, staging_pbp_buffer, pbp_buf, - size * sizeof(int)); + if (changed != nullptr && changed->size() > 0) { + std::vector > pieces; + for (auto idx : *changed) { + pieces.emplace_back(std::make_tuple(sizeof(int), idx * sizeof(int))); + } + + vulkan_flush_buffer_pieces(device, phys_atom_size, staging_pbp_buffer_mem, + pieces); + + vulkan_copy_buffer_pieces(device, command_pool, queue, staging_pbp_buffer, + pbp_buf, pieces); + changed->clear(); + } else { + vulkan_flush_buffer(device, staging_pbp_buffer_mem); + vulkan_copy_buffer(device, command_pool, queue, staging_pbp_buffer, pbp_buf, + size * sizeof(int)); + } VkCommandBufferBeginInfo begin_info{}; begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;