diff --git a/src/hid_core/resources/npad/npad.cpp b/src/hid_core/resources/npad/npad.cpp index 8e309238d2..ab5a3f7823 100644 --- a/src/hid_core/resources/npad/npad.cpp +++ b/src/hid_core/resources/npad/npad.cpp @@ -50,8 +50,8 @@ NPad::NPad(Core::HID::HIDCore& hid_core_, KernelHelpers::ServiceContext& service auto& controller = controller_data[aruid_index][i]; controller.device = hid_core.GetEmulatedControllerByIndex(i); Core::HID::ControllerUpdateCallback engine_callback{ - .on_change = [this, i](Core::HID::ControllerTriggerType type) { - ControllerUpdate(hid_core.kernel, type, i); + .on_change = [this, i, kernel = &hid_core.kernel](Core::HID::ControllerTriggerType type) { + ControllerUpdate(*kernel, type, i); }, .is_npad_service = true, }; diff --git a/src/shader_recompiler/frontend/maxwell/translate_program.cpp b/src/shader_recompiler/frontend/maxwell/translate_program.cpp index 705f20850a..ebc5a825dd 100644 --- a/src/shader_recompiler/frontend/maxwell/translate_program.cpp +++ b/src/shader_recompiler/frontend/maxwell/translate_program.cpp @@ -236,8 +236,11 @@ void LowerGeometryPassthrough(const IR::Program& program, const HostTranslateInf IR::Program TranslateProgram(ObjectPool& inst_pool, ObjectPool& block_pool, Environment& env, Flow::CFG& cfg, const HostTranslateInfo& host_info) { + HostTranslateInfo normalized_host_info{host_info}; + normalized_host_info.ApplyDescriptorLimitPolicy(); + IR::Program program; - program.syntax_list = BuildASL(inst_pool, block_pool, env, cfg, host_info); + program.syntax_list = BuildASL(inst_pool, block_pool, env, cfg, normalized_host_info); program.blocks = GenerateBlocks(program.syntax_list); program.post_order_blocks = PostOrder(program.syntax_list.front()); program.stage = env.ShaderStage(); @@ -260,9 +263,9 @@ IR::Program TranslateProgram(ObjectPool& inst_pool, ObjectPool> (i % 32)) & 1) == 0; } - if (!host_info.support_geometry_shader_passthrough) { + if (!normalized_host_info.support_geometry_shader_passthrough) { program.output_vertices = GetOutputTopologyVertices(program.output_topology); - LowerGeometryPassthrough(program, host_info); + LowerGeometryPassthrough(program, normalized_host_info); } } break; @@ -277,16 +280,16 @@ IR::Program TranslateProgram(ObjectPool& inst_pool, ObjectPool& inst_pool, ObjectPool& inst_pool, ObjectPool; constexpr u32 DESCRIPTOR_SIZE = 8; -constexpr u32 DESCRIPTOR_SIZE_SHIFT = static_cast(std::countr_zero(DESCRIPTOR_SIZE)); -constexpr u32 DYNAMIC_DESCRIPTOR_CBUF_BYTES = 16 * 1024; -constexpr u32 MAX_DYNAMIC_DESCRIPTOR_COUNT = 1024; +constexpr u32 DESCRIPTOR_SIZE_SHIFT = u32(std::countr_zero(DESCRIPTOR_SIZE)); +constexpr u32 DESCRIPTOR_MAX_COUNT = 1024; u32 DynamicDescriptorSizeShift(const IR::U32& dynamic_offset) { - const IR::Inst* const inst{dynamic_offset.InstRecursive()}; - if (!inst || inst->GetOpcode() != IR::Opcode::ShiftLeftLogical32) { + const IR::Inst* const inst = dynamic_offset.InstRecursive(); + if (!inst || inst->GetOpcode() != IR::Opcode::ShiftLeftLogical32) return DESCRIPTOR_SIZE_SHIFT; - } - const IR::Value shift{inst->Arg(1)}; - if (!shift.IsImmediate()) { + const IR::Value shift = inst->Arg(1); + if (!shift.IsImmediate()) return DESCRIPTOR_SIZE_SHIFT; - } - const u32 size_shift{shift.U32()}; - return size_shift >= DESCRIPTOR_SIZE_SHIFT && size_shift < 31 ? size_shift - : DESCRIPTOR_SIZE_SHIFT; + const u32 size_shift = shift.U32(); + return size_shift >= DESCRIPTOR_SIZE_SHIFT && size_shift < 31 ? size_shift : DESCRIPTOR_SIZE_SHIFT; } -u32 DynamicDescriptorCount(u32 base_offset, u32 size_shift) { - if (size_shift >= 31 || base_offset >= DYNAMIC_DESCRIPTOR_CBUF_BYTES) { +u32 DynamicDescriptorCount(u32 base_offset, u32 size_shift, u32 max_descriptors) { + auto const descriptor_limit = (std::max)(1U, max_descriptors); + auto const max_cbuf_bytes = 16 * descriptor_limit; + if (size_shift >= 31 || base_offset >= max_cbuf_bytes) return 1; - } - const u32 stride{1U << size_shift}; - const u32 available{DYNAMIC_DESCRIPTOR_CBUF_BYTES - base_offset}; - if (available < DESCRIPTOR_SIZE) { + auto const stride = 1U << size_shift; + auto const available = max_cbuf_bytes - base_offset; + if (available < DESCRIPTOR_SIZE) return 1; - } - const u32 available_count{1U + (available - DESCRIPTOR_SIZE) / stride}; - return std::min(MAX_DYNAMIC_DESCRIPTOR_COUNT, available_count); + auto const available_count = 1U + (available - DESCRIPTOR_SIZE) / stride; + return std::min(descriptor_limit, available_count); } u32 SaturatingSub(u32 lhs, u32 rhs) { return lhs > rhs ? lhs - rhs : 0; } -template -u32 StaticDescriptorCount(const Descriptors& descriptors) { - u32 count{}; - for (const auto& desc : descriptors) { - if (desc.count <= 1) { - count += desc.count; - } - } - return count; +template +[[nodiscard]] u32 StaticDescriptorCount(T const& descriptors) noexcept { + return std::accumulate(descriptors.cbegin(), descriptors.cend(), 0U, [](auto const& acc, auto const& e) { + return acc + (e.count <= 1 ? e.count : 0); + }); } -u32 DynamicSampledTextureCap(const Info& info, const HostTranslateInfo& host_info, - u32 dynamic_arrays) { - if (dynamic_arrays == 0) { - return MAX_DYNAMIC_DESCRIPTOR_COUNT; +u32 DynamicSampledTextureCap(const Info& info, const HostTranslateInfo& host_info, u32 dynamic_arrays) { + auto const sampled_limit = (std::max)(1U, std::min(host_info.max_per_stage_descriptor_sampled_images, + host_info.max_descriptor_set_sampled_images)); + auto const resource_limit = (std::max)(1U, host_info.max_per_stage_resources); + if (dynamic_arrays > 0) { + auto const sampled_static_count = StaticDescriptorCount(info.texture_buffer_descriptors) + StaticDescriptorCount(info.texture_descriptors); + auto const resource_static_count = + NumDescriptors(info.constant_buffer_descriptors) + + NumDescriptors(info.storage_buffers_descriptors) + + sampled_static_count + NumDescriptors(info.image_buffer_descriptors) + + NumDescriptors(info.image_descriptors); + auto const sampled_budget = SaturatingSub(sampled_limit, sampled_static_count); + auto const resource_budget = SaturatingSub(resource_limit, resource_static_count); + auto const sampled_cap = sampled_budget / dynamic_arrays; + auto const resource_cap = resource_budget / dynamic_arrays; + return (std::max)(1U, (std::min)(sampled_cap, resource_cap)); } - const u32 sampled_static_count{StaticDescriptorCount(info.texture_buffer_descriptors) + - StaticDescriptorCount(info.texture_descriptors)}; - const u32 resource_static_count{ - NumDescriptors(info.constant_buffer_descriptors) + - NumDescriptors(info.storage_buffers_descriptors) + sampled_static_count + - NumDescriptors(info.image_buffer_descriptors) + NumDescriptors(info.image_descriptors)}; - const u32 sampled_limit{std::min(host_info.max_per_stage_descriptor_sampled_images, - host_info.max_descriptor_set_sampled_images)}; - const u32 sampled_budget{SaturatingSub(sampled_limit, sampled_static_count)}; - const u32 resource_budget{SaturatingSub(host_info.max_per_stage_resources, - resource_static_count)}; - const u32 sampled_cap{sampled_budget / dynamic_arrays}; - const u32 resource_cap{resource_budget / dynamic_arrays}; - return std::max(1U, std::min({MAX_DYNAMIC_DESCRIPTOR_COUNT, sampled_cap, resource_cap})); + return (std::min)({DESCRIPTOR_MAX_COUNT, sampled_limit, resource_limit}); } IR::Opcode IndexedInstruction(const IR::Inst& inst) { @@ -304,21 +295,23 @@ static inline bool IsTexturePixelFormatIntegerCached(Environment& env, } -std::optional Track(const IR::Value& value, Environment& env); -static inline std::optional TrackCached(const IR::Value& v, Environment& env) { +std::optional Track(const IR::Value& value, Environment& env, const HostTranslateInfo& host_info); +static inline std::optional TrackCached(const IR::Value& v, Environment& env, const HostTranslateInfo& host_info) { if (const IR::Inst* key = v.InstRecursive()) { if (auto it = env.track_cache.find(key); it != env.track_cache.end()) return it->second; - auto found = Track(v, env); + auto found = Track(v, env, host_info); if (found) env.track_cache.emplace(key, *found); return found; } - return Track(v, env); + return Track(v, env, host_info); } -std::optional TryGetConstBuffer(const IR::Inst* inst, Environment& env); +std::optional TryGetConstBuffer(const IR::Inst* inst, Environment& env, const HostTranslateInfo& host_info); -std::optional Track(const IR::Value& value, Environment& env) { - return IR::BreadthFirstSearch(value, [&env](const IR::Inst* inst) { return TryGetConstBuffer(inst, env); }); +std::optional Track(const IR::Value& value, Environment& env, const HostTranslateInfo& host_info) { + return IR::BreadthFirstSearch(value, [&env, &host_info](const IR::Inst* inst) { + return TryGetConstBuffer(inst, env, host_info); + }); } std::optional TryGetConstant(IR::Value& value, Environment& env) { @@ -342,13 +335,13 @@ std::optional TryGetConstant(IR::Value& value, Environment& env) { return ReadCbufCached(env, index_number, offset_number); } -std::optional TryGetConstBuffer(const IR::Inst* inst, Environment& env) { +std::optional TryGetConstBuffer(const IR::Inst* inst, Environment& env, const HostTranslateInfo& host_info) { switch (inst->GetOpcode()) { default: return std::nullopt; case IR::Opcode::BitwiseOr32: { - std::optional lhs{TrackCached(inst->Arg(0), env)}; - std::optional rhs{TrackCached(inst->Arg(1), env)}; + std::optional lhs{TrackCached(inst->Arg(0), env, host_info)}; + std::optional rhs{TrackCached(inst->Arg(1), env, host_info)}; if (!lhs || !rhs) { return std::nullopt; } @@ -378,12 +371,11 @@ std::optional TryGetConstBuffer(const IR::Inst* inst, Environme if (!shift.IsImmediate()) { return std::nullopt; } - std::optional lhs{TrackCached(inst->Arg(0), env)}; + std::optional lhs{TrackCached(inst->Arg(0), env, host_info)}; if (lhs) { lhs->shift_left = shift.U32(); } return lhs; - break; } case IR::Opcode::BitwiseAnd32: { IR::Value op1{inst->Arg(0)}; @@ -407,7 +399,7 @@ std::optional TryGetConstBuffer(const IR::Inst* inst, Environme return std::nullopt; } while (false); } - std::optional lhs{TrackCached(op1, env)}; + std::optional lhs{TrackCached(op1, env, host_info)}; if (lhs) { lhs->shift_left = static_cast(std::countr_zero(op2.U32())); } @@ -453,7 +445,10 @@ std::optional TryGetConstBuffer(const IR::Inst* inst, Environme } else { return std::nullopt; } - const u32 size_shift{DynamicDescriptorSizeShift(dynamic_offset)}; + auto const size_shift = DynamicDescriptorSizeShift(dynamic_offset); + auto const sampled_limit = (std::max)(1U, (std::min)(host_info.max_per_stage_descriptor_sampled_images, + host_info.max_descriptor_set_sampled_images)); + auto const resource_limit = (std::max)(1U, host_info.max_per_stage_resources); return ConstBufferAddr{ .index = index.U32(), .offset = base_offset, @@ -462,15 +457,15 @@ std::optional TryGetConstBuffer(const IR::Inst* inst, Environme .secondary_offset = 0, .secondary_shift_left = 0, .dynamic_offset = dynamic_offset, - .count = DynamicDescriptorCount(base_offset, size_shift), + .count = DynamicDescriptorCount(base_offset, size_shift, (std::min)({DESCRIPTOR_MAX_COUNT, sampled_limit, resource_limit})), .has_secondary = false, }; } -TextureInst MakeInst(Environment& env, IR::Block* block, IR::Inst& inst) { +TextureInst MakeInst(Environment& env, IR::Block* block, IR::Inst& inst, const HostTranslateInfo& host_info) { ConstBufferAddr addr; if (IsBindless(inst)) { - const std::optional track_addr{TrackCached(inst.Arg(0), env)}; + const std::optional track_addr{TrackCached(inst.Arg(0), env, host_info)}; if (!track_addr) { throw NotImplementedException("Failed to track bindless texture constant buffer"); @@ -506,15 +501,15 @@ u32 GetTextureHandle(Environment& env, const ConstBufferAddr& cbuf) { return lhs_raw | rhs_raw; } - [[maybe_unused]]TextureType ReadTextureType(Environment& env, const ConstBufferAddr& cbuf) { +[[maybe_unused]] TextureType ReadTextureType(Environment& env, const ConstBufferAddr& cbuf) { return env.ReadTextureType(GetTextureHandle(env, cbuf)); } - [[maybe_unused]]TexturePixelFormat ReadTexturePixelFormat(Environment& env, const ConstBufferAddr& cbuf) { +[[maybe_unused]] TexturePixelFormat ReadTexturePixelFormat(Environment& env, const ConstBufferAddr& cbuf) { return env.ReadTexturePixelFormat(GetTextureHandle(env, cbuf)); } - [[maybe_unused]]bool IsTexturePixelFormatInteger(Environment& env, const ConstBufferAddr& cbuf) { +[[maybe_unused]] bool IsTexturePixelFormatInteger(Environment& env, const ConstBufferAddr& cbuf) { return env.IsTexturePixelFormatInteger(GetTextureHandle(env, cbuf)); } @@ -675,7 +670,7 @@ void TexturePass(Environment& env, IR::Program& program, const HostTranslateInfo if (!IsTextureInstruction(inst)) { continue; } - to_replace.push_back(MakeInst(env, block, inst)); + to_replace.push_back(MakeInst(env, block, inst, host_info)); } } // Sort instructions to visit textures by constant buffer index, then by offset @@ -689,8 +684,7 @@ void TexturePass(Environment& env, IR::Program& program, const HostTranslateInfo program.info.texture_descriptors, program.info.image_descriptors, }; - const u32 sampled_dynamic_cap{ - DynamicSampledTextureCap(program.info, host_info, DynamicSampledTextureArrayCount(to_replace))}; + const u32 sampled_dynamic_cap = DynamicSampledTextureCap(program.info, host_info, DynamicSampledTextureArrayCount(to_replace)); for (TextureInst& texture_inst : to_replace) { // TODO: Handle arrays IR::Inst* const inst{texture_inst.inst}; diff --git a/src/shader_recompiler/profile.h b/src/shader_recompiler/profile.h index ff19f0710f..bd7bc6ac7b 100644 --- a/src/shader_recompiler/profile.h +++ b/src/shader_recompiler/profile.h @@ -92,7 +92,6 @@ struct Profile { bool has_broken_robust{}; u64 min_ssbo_alignment{}; - u32 max_user_clip_distances{}; }; diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index bc88796e3f..abfe4554f2 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -245,16 +245,31 @@ ShaderCache::ShaderCache(Tegra::MaxwellDeviceMemoryManager& device_memory_, std::min(device.GetMaxUserClipDistances(), Maxwell::Regs::NumClipDistances), }, host_info{ - .support_float64 = true, - .support_float16 = false, - .support_int64 = device.HasShaderInt64(), - .needs_demote_reorder = device.IsAmd(), - .support_snorm_render_buffer = false, - .support_viewport_index_layer = device.HasVertexViewportLayer(), - .min_ssbo_alignment = static_cast(device.GetShaderStorageBufferAlignment()), - .support_geometry_shader_passthrough = device.HasGeometryShaderPassthrough(), - .support_conditional_barrier = device.SupportsConditionalBarriers(), + .min_ssbo_alignment = static_cast(device.GetShaderStorageBufferAlignment()), + .max_per_stage_descriptor_sampled_images = + Shader::HostTranslateInfo::DEFAULT_DESCRIPTOR_LIMIT, + .max_per_stage_resources = Shader::HostTranslateInfo::DEFAULT_DESCRIPTOR_LIMIT, + .max_descriptor_set_samplers = Shader::HostTranslateInfo::DEFAULT_DESCRIPTOR_LIMIT, + .max_descriptor_set_uniform_buffers = Shader::HostTranslateInfo::DEFAULT_DESCRIPTOR_LIMIT, + .max_descriptor_set_uniform_buffers_dynamic = + Shader::HostTranslateInfo::DEFAULT_DESCRIPTOR_LIMIT, + .max_descriptor_set_storage_buffers = Shader::HostTranslateInfo::DEFAULT_DESCRIPTOR_LIMIT, + .max_descriptor_set_storage_buffers_dynamic = + Shader::HostTranslateInfo::DEFAULT_DESCRIPTOR_LIMIT, + .max_descriptor_set_sampled_images = Shader::HostTranslateInfo::DEFAULT_DESCRIPTOR_LIMIT, + .max_descriptor_set_storage_images = Shader::HostTranslateInfo::DEFAULT_DESCRIPTOR_LIMIT, + .max_descriptor_set_input_attachements = + Shader::HostTranslateInfo::DEFAULT_DESCRIPTOR_LIMIT, + .support_float64 = true, + .support_float16 = false, + .support_int64 = device.HasShaderInt64(), + .needs_demote_reorder = device.IsAmd(), + .support_snorm_render_buffer = false, + .support_viewport_index_layer = device.HasVertexViewportLayer(), + .support_geometry_shader_passthrough = device.HasGeometryShaderPassthrough(), + .support_conditional_barrier = device.SupportsConditionalBarriers(), } { + host_info.ApplyDescriptorLimitPolicy(); if (use_asynchronous_shaders) { workers = CreateWorkers(); } diff --git a/src/video_core/renderer_vulkan/pipeline_helper.h b/src/video_core/renderer_vulkan/pipeline_helper.h index 451a02c7e6..35c9379987 100644 --- a/src/video_core/renderer_vulkan/pipeline_helper.h +++ b/src/video_core/renderer_vulkan/pipeline_helper.h @@ -22,6 +22,15 @@ namespace Vulkan { using Shader::Backend::SPIRV::NUM_TEXTURE_AND_IMAGE_SCALING_WORDS; +[[nodiscard]] inline u32 NumDescriptorEntries(const Shader::Info& info) { + return Shader::NumDescriptors(info.constant_buffer_descriptors) + + Shader::NumDescriptors(info.storage_buffers_descriptors) + + Shader::NumDescriptors(info.texture_buffer_descriptors) + + Shader::NumDescriptors(info.image_buffer_descriptors) + + Shader::NumDescriptors(info.texture_descriptors) + + Shader::NumDescriptors(info.image_descriptors); +} + class DescriptorLayoutBuilder { public: DescriptorLayoutBuilder(const Device& device_) : device{&device_} {} diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index c24b7a5757..471cbbb0df 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -326,7 +326,7 @@ std::pair Uint8Pass::Assemble(u32 num_vertices, VkBuffer const u32 staging_size = static_cast(num_vertices * sizeof(u16)); const auto staging = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal); - compute_pass_descriptor_queue.Acquire(); + compute_pass_descriptor_queue.Acquire(scheduler, 2); compute_pass_descriptor_queue.AddBuffer(src_buffer, src_offset, num_vertices); compute_pass_descriptor_queue.AddBuffer(staging.buffer, staging.offset, staging_size); const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()}; @@ -384,7 +384,7 @@ std::pair QuadIndexedPass::Assemble( const std::size_t staging_size = num_tri_vertices * sizeof(u32); const auto staging = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal); - compute_pass_descriptor_queue.Acquire(); + compute_pass_descriptor_queue.Acquire(scheduler, 2); compute_pass_descriptor_queue.AddBuffer(src_buffer, src_offset, input_size); compute_pass_descriptor_queue.AddBuffer(staging.buffer, staging.offset, staging_size); const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()}; @@ -429,7 +429,7 @@ void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_ } const size_t compare_size = compare_to_zero ? 8 : 24; - compute_pass_descriptor_queue.Acquire(); + compute_pass_descriptor_queue.Acquire(scheduler, 2); compute_pass_descriptor_queue.AddBuffer(src_buffer, src_offset, compare_size); compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, sizeof(u32)); const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()}; @@ -498,7 +498,7 @@ void QueriesPrefixScanPass::Run(VkBuffer accumulation_buffer, VkBuffer dst_buffe static constexpr size_t DISPATCH_SIZE = 2048U; size_t runs_to_do = std::min(current_runs, DISPATCH_SIZE); current_runs -= runs_to_do; - compute_pass_descriptor_queue.Acquire(); + compute_pass_descriptor_queue.Acquire(scheduler, 3); compute_pass_descriptor_queue.AddBuffer(src_buffer, 0, number_of_sums * sizeof(u64)); compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, number_of_sums * sizeof(u64)); compute_pass_descriptor_queue.AddBuffer(accumulation_buffer, 0, sizeof(u64)); @@ -600,7 +600,7 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, const u32 num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 8U); const u32 num_dispatches_z = image.info.resources.layers; - compute_pass_descriptor_queue.Acquire(); + compute_pass_descriptor_queue.Acquire(scheduler, 2); compute_pass_descriptor_queue.AddBuffer(map.buffer, input_offset, image.guest_size_bytes - swizzle.buffer_offset); compute_pass_descriptor_queue.AddImage(image.StorageImageView(swizzle.level)); @@ -821,7 +821,7 @@ void BlockLinearUnswizzle3DPass::UnswizzleChunk( pc.blocks_dim[1] = blocks_y; pc.blocks_dim[2] = z_count; // Only process the count - compute_pass_descriptor_queue.Acquire(); + compute_pass_descriptor_queue.Acquire(scheduler, 3); compute_pass_descriptor_queue.AddBuffer(*image.runtime->swizzle_table_buffer, 0, image.runtime->swizzle_table_size); compute_pass_descriptor_queue.AddBuffer(swizzled.buffer, @@ -989,7 +989,7 @@ void MSAACopyPass::CopyImage(Image& dst_image, Image& src_image, ASSERT(copy.dst_subresource.base_layer == 0); ASSERT(copy.dst_subresource.num_layers == 1); - compute_pass_descriptor_queue.Acquire(); + compute_pass_descriptor_queue.Acquire(scheduler, 2); compute_pass_descriptor_queue.AddImage( src_image.StorageImageView(copy.src_subresource.base_level)); compute_pass_descriptor_queue.AddImage( diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index 46a3b4bcb8..13bd654a80 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -45,6 +45,7 @@ ComputePipeline::ComputePipeline(const Device& device_, Scheduler& scheduler, vk } std::copy_n(info.constant_buffer_used_sizes.begin(), uniform_buffer_sizes.size(), uniform_buffer_sizes.begin()); + num_descriptor_entries = NumDescriptorEntries(info); auto func{[this, &scheduler, &descriptor_pool, shader_notify, pipeline_statistics] { DescriptorLayoutBuilder builder{device}; @@ -113,7 +114,7 @@ ComputePipeline::ComputePipeline(const Device& device_, Scheduler& scheduler, vk void ComputePipeline::Configure(Tegra::Engines::KeplerCompute& kepler_compute, Tegra::MemoryManager& gpu_memory, Scheduler& scheduler, BufferCache& buffer_cache, TextureCache& texture_cache) { - guest_descriptor_queue.Acquire(); + guest_descriptor_queue.Acquire(scheduler, num_descriptor_entries); buffer_cache.SetComputeUniformBufferState(info.constant_buffer_mask, &uniform_buffer_sizes); buffer_cache.UnbindComputeStorageBuffers(); diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.h b/src/video_core/renderer_vulkan/vk_compute_pipeline.h index f3abe4c931..1feeed4840 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.h +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.h @@ -53,6 +53,7 @@ private: vk::PipelineCache& pipeline_cache; GuestDescriptorQueue& guest_descriptor_queue; Shader::Info info; + u32 num_descriptor_entries{}; VideoCommon::ComputeUniformBufferSizes uniform_buffer_sizes{}; diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index 01495b4515..0c0174391c 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -268,6 +268,7 @@ GraphicsPipeline::GraphicsPipeline( num_textures += Shader::NumDescriptors(info->texture_descriptors); num_image_elements += Shader::NumDescriptors(info->texture_descriptors); num_image_elements += Shader::NumDescriptors(info->image_descriptors); + num_descriptor_entries += NumDescriptorEntries(*info); } fragment_has_color0_output = stage_infos[NUM_STAGES - 1].stores_frag_color[0]; auto func{[this, shader_notify, &render_pass_cache, &descriptor_pool, pipeline_statistics] { @@ -473,7 +474,7 @@ bool GraphicsPipeline::ConfigureImpl(bool is_indexed) { buffer_cache.UpdateGraphicsBuffers(is_indexed); buffer_cache.BindHostGeometryBuffers(is_indexed); - guest_descriptor_queue.Acquire(); + guest_descriptor_queue.Acquire(scheduler, num_descriptor_entries); RescalingPushConstant rescaling; RenderAreaPushConstant render_area; diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h index 1a41e50a36..d1caeaee8e 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h @@ -159,6 +159,7 @@ private: std::array stage_infos; std::array enabled_uniform_buffer_masks{}; VideoCommon::UniformBufferSizes uniform_buffer_sizes{}; + u32 num_descriptor_entries{}; size_t num_image_elements{}; u32 num_textures{}; bool fragment_has_color0_output{}; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index ad7bfd0718..15f7e9bf43 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -439,10 +439,21 @@ PipelineCache::PipelineCache(Tegra::MaxwellDeviceMemoryManager& device_memory_, .has_broken_robust = device.IsNvidia() && device.GetNvidiaArch() <= NvidiaArchitecture::Arch_Pascal, .min_ssbo_alignment = device.GetStorageBufferAlignment(), - .max_user_clip_distances = device.GetMaxUserClipDistances(), + .max_user_clip_distances = device.GetMaxUserClipDistances() }; host_info = Shader::HostTranslateInfo{ + .min_ssbo_alignment = device.GetStorageBufferAlignment(), + .max_per_stage_descriptor_sampled_images = device.GetMaxPerStageDescriptorSampledImages(), + .max_per_stage_resources = device.GetMaxPerStageResources(), + .max_descriptor_set_samplers = device.GetMaxDescriptorSetSamplers(), + .max_descriptor_set_uniform_buffers = device.GetMaxDescriptorSetUniformBuffers(), + .max_descriptor_set_uniform_buffers_dynamic = device.GetMaxDescriptorSetUniformBuffersDynamic(), + .max_descriptor_set_storage_buffers = device.GetMaxDescriptorSetStorageBuffers(), + .max_descriptor_set_storage_buffers_dynamic = device.GetMaxDescriptorSetStorageBuffersDynamic(), + .max_descriptor_set_sampled_images = device.GetMaxDescriptorSetSampledImages(), + .max_descriptor_set_storage_images = device.GetMaxDescriptorSetStorageImages(), + .max_descriptor_set_input_attachements = device.GetMaxDescriptorSetInputAttachments(), .support_float64 = device.IsFloat64Supported(), .support_float16 = device.IsFloat16Supported(), .support_int64 = device.IsShaderInt64Supported(), @@ -451,13 +462,10 @@ PipelineCache::PipelineCache(Tegra::MaxwellDeviceMemoryManager& device_memory_, driver_id == VK_DRIVER_ID_SAMSUNG_PROPRIETARY, .support_snorm_render_buffer = true, .support_viewport_index_layer = device.IsExtShaderViewportIndexLayerSupported(), - .min_ssbo_alignment = static_cast(device.GetStorageBufferAlignment()), - .max_per_stage_descriptor_sampled_images = device.GetMaxPerStageDescriptorSampledImages(), - .max_per_stage_resources = device.GetMaxPerStageResources(), - .max_descriptor_set_sampled_images = device.GetMaxDescriptorSetSampledImages(), .support_geometry_shader_passthrough = device.IsNvGeometryShaderPassthroughSupported(), .support_conditional_barrier = device.SupportsConditionalBarriers(), }; + host_info.ApplyDescriptorLimitPolicy(); if (device.GetMaxVertexInputAttributes() < Maxwell::NumVertexAttributes) { LOG_WARNING(Render_Vulkan, "maxVertexInputAttributes is too low: {} < {}", diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index dc84f5c0ad..863ebb72ec 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -203,7 +203,7 @@ RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra : gpu{gpu_}, device_memory{device_memory_}, device{device_}, memory_allocator{memory_allocator_}, state_tracker{state_tracker_}, scheduler{scheduler_}, staging_pool(device, memory_allocator, scheduler), descriptor_pool(device, scheduler), - guest_descriptor_queue(device, scheduler), compute_pass_descriptor_queue(device, scheduler), + guest_descriptor_queue(device), compute_pass_descriptor_queue(device), blit_image(device, scheduler, state_tracker, descriptor_pool), render_pass_cache(device), texture_cache_runtime{ device, scheduler, memory_allocator, staging_pool, diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index d1e064c34a..6a631f68a2 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -155,15 +155,14 @@ void Scheduler::WaitWorker() { } void Scheduler::DispatchWork() { - if (chunk->Empty()) { - return; + if (chunk && !chunk->Empty()) { + { + std::scoped_lock ql{queue_mutex}; + work_queue.push(std::move(chunk)); + } + event_cv.notify_all(); + AcquireNewChunk(); } - { - std::scoped_lock ql{queue_mutex}; - work_queue.push(std::move(chunk)); - } - event_cv.notify_all(); - AcquireNewChunk(); } void Scheduler::RequestRenderpass(const Framebuffer* framebuffer) { diff --git a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp index aa7de7b632..b0227ac908 100644 --- a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp +++ b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp @@ -7,6 +7,7 @@ #include #include +#include "common/assert.h" #include "common/logging.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_update_descriptor.h" @@ -15,8 +16,9 @@ namespace Vulkan { -UpdateDescriptorQueue::UpdateDescriptorQueue(const Device& device_, Scheduler& scheduler_) - : device{device_}, scheduler{scheduler_} { +UpdateDescriptorQueue::UpdateDescriptorQueue(const Device& device_) + : device{device_} +{ payload_start = payload.data(); payload_cursor = payload.data(); } @@ -31,13 +33,15 @@ void UpdateDescriptorQueue::TickFrame() { payload_cursor = payload_start; } -void UpdateDescriptorQueue::Acquire() { - // Minimum number of entries required. - // This is the maximum number of entries a single draw call might use. - static constexpr size_t MIN_ENTRIES = 0x400; - - if (std::distance(payload_start, payload_cursor) + MIN_ENTRIES >= FRAME_PAYLOAD_SIZE) { - LOG_WARNING(Render_Vulkan, "Payload overflow, waiting for worker thread"); +void UpdateDescriptorQueue::Acquire(Scheduler& scheduler, size_t required_entries) { + static constexpr size_t DEFAULT_REQUIRED_ENTRIES = 0x400; + const size_t reserve = required_entries > 0 ? required_entries : DEFAULT_REQUIRED_ENTRIES; + ASSERT_MSG(reserve < FRAME_PAYLOAD_SIZE, "Descriptor reservation {} >= frame capacity {}", + reserve, FRAME_PAYLOAD_SIZE); + const size_t used = static_cast(std::distance(payload_start, payload_cursor)); + if (used + reserve >= FRAME_PAYLOAD_SIZE) { + LOG_WARNING(Render_Vulkan, "Payload overflow (used={}, reserve={}, capacity={})", + used, reserve, FRAME_PAYLOAD_SIZE); scheduler.WaitWorker(); payload_cursor = payload_start; } diff --git a/src/video_core/renderer_vulkan/vk_update_descriptor.h b/src/video_core/renderer_vulkan/vk_update_descriptor.h index 1497108b16..39b2b5a688 100644 --- a/src/video_core/renderer_vulkan/vk_update_descriptor.h +++ b/src/video_core/renderer_vulkan/vk_update_descriptor.h @@ -34,12 +34,11 @@ class UpdateDescriptorQueue final { static constexpr size_t PAYLOAD_SIZE = FRAME_PAYLOAD_SIZE * FRAMES_IN_FLIGHT; public: - explicit UpdateDescriptorQueue(const Device& device_, Scheduler& scheduler_); + explicit UpdateDescriptorQueue(const Device& device_); ~UpdateDescriptorQueue(); void TickFrame(); - - void Acquire(); + void Acquire(Scheduler& scheduler, size_t required_entries = 0); const DescriptorUpdateEntry* UpdateData() const noexcept { return upload_start; @@ -75,8 +74,6 @@ public: private: const Device& device; - Scheduler& scheduler; - size_t frame_index{0}; DescriptorUpdateEntry* payload_cursor = nullptr; DescriptorUpdateEntry* payload_start = nullptr; diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index 7f2c29519f..110d0c1199 100644 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -321,32 +321,23 @@ public: return properties.properties.limits.maxPushConstantsSize; } - /// Returns the maximum size for shared memory. - u32 GetMaxComputeSharedMemorySize() const { - return properties.properties.limits.maxComputeSharedMemorySize; - } - - /// Returns the maximum number of dynamic storage buffer descriptors per set. - u32 GetMaxDescriptorSetStorageBuffersDynamic() const { - return properties.properties.limits.maxDescriptorSetStorageBuffersDynamic; - } - - /// Returns the maximum number of dynamic uniform buffer descriptors per set. - u32 GetMaxDescriptorSetUniformBuffersDynamic() const { - return properties.properties.limits.maxDescriptorSetUniformBuffersDynamic; - } - - u32 GetMaxPerStageDescriptorSampledImages() const { - return properties.properties.limits.maxPerStageDescriptorSampledImages; - } - - u32 GetMaxPerStageResources() const { - return properties.properties.limits.maxPerStageResources; - } - - u32 GetMaxDescriptorSetSampledImages() const { - return properties.properties.limits.maxDescriptorSetSampledImages; - } +#define FN_MAX_LIMIT_LIST \ + FN_MAX_LIMIT_ELEM(ComputeSharedMemorySize) \ + FN_MAX_LIMIT_ELEM(PerStageDescriptorSampledImages) \ + FN_MAX_LIMIT_ELEM(PerStageResources) \ + FN_MAX_LIMIT_ELEM(DescriptorSetSamplers) \ + FN_MAX_LIMIT_ELEM(DescriptorSetUniformBuffers) \ + FN_MAX_LIMIT_ELEM(DescriptorSetUniformBuffersDynamic) \ + FN_MAX_LIMIT_ELEM(DescriptorSetStorageBuffers) \ + FN_MAX_LIMIT_ELEM(DescriptorSetStorageBuffersDynamic) \ + FN_MAX_LIMIT_ELEM(DescriptorSetSampledImages) \ + FN_MAX_LIMIT_ELEM(DescriptorSetStorageImages) \ + FN_MAX_LIMIT_ELEM(DescriptorSetInputAttachments) +#define FN_MAX_LIMIT_ELEM(name) \ + u32 GetMax##name() const { return properties.properties.limits.max##name; } +FN_MAX_LIMIT_LIST +#undef FN_MAX_LIMIT_ELEM +#undef FN_MAX_LIMIT_LIST /// Returns float control properties of the device. const VkPhysicalDeviceFloatControlsPropertiesKHR& FloatControlProperties() const { diff --git a/tools/maxwell-ir/main.cpp b/tools/maxwell-ir/main.cpp index 66aeaeac40..645816878d 100644 --- a/tools/maxwell-ir/main.cpp +++ b/tools/maxwell-ir/main.cpp @@ -44,6 +44,7 @@ int IrShaderRecompilerImpl(int argc, char *argv[]) { host_info.support_geometry_shader_passthrough = true; host_info.support_conditional_barrier = true; host_info.min_ssbo_alignment = 0; + host_info.ApplyDescriptorLimitPolicy(); auto program = Shader::Maxwell::TranslateProgram(inst_pool, block_pool, env, cfg, host_info); auto const dumped_ir = Shader::IR::DumpProgram(program); std::printf("%s\n", dumped_ir.c_str()); diff --git a/tools/maxwell-spirv/spirv_recompiler_impl.cpp b/tools/maxwell-spirv/spirv_recompiler_impl.cpp index 55830abe39..9dae69df91 100644 --- a/tools/maxwell-spirv/spirv_recompiler_impl.cpp +++ b/tools/maxwell-spirv/spirv_recompiler_impl.cpp @@ -52,6 +52,7 @@ int SpirvShaderRecompilerImpl(int argc, char *argv[]) { host_info.support_geometry_shader_passthrough = true; host_info.support_conditional_barrier = true; host_info.min_ssbo_alignment = 0; + host_info.ApplyDescriptorLimitPolicy(); auto program = Shader::Maxwell::TranslateProgram(inst_pool, block_pool, env, cfg, host_info); // IR::Program TranslateProgram(ObjectPool& inst_pool, ObjectPool& block_pool,