gsp: Fix GPU interrupt queue and add GPU timing emulation (#2095)
Some checks are pending
citra-build / source (push) Waiting to run
citra-build / linux-x86_64 (appimage) (push) Waiting to run
citra-build / linux-x86_64 (appimage-wayland) (push) Waiting to run
citra-build / linux-x86_64 (gcc-nopch) (push) Waiting to run
citra-build / linux-arm64 (clang) (push) Waiting to run
citra-build / linux-arm64 (gcc-nopch) (push) Waiting to run
citra-build / macos (push) Waiting to run
citra-build / windows (msvc) (push) Waiting to run
citra-build / windows (msys2) (push) Waiting to run
citra-build / android (googleplay) (push) Waiting to run
citra-build / android (vanilla) (push) Waiting to run
citra-build / docker (push) Waiting to run
citra-format / clang-format (push) Waiting to run
citra-libretro / android (push) Waiting to run
citra-libretro / linux (push) Waiting to run
citra-libretro / windows (push) Waiting to run
citra-libretro / macos (arm64) (push) Waiting to run
citra-libretro / macos (x86_64) (push) Waiting to run
citra-libretro / ios (push) Waiting to run
citra-libretro / tvos (push) Waiting to run
citra-transifex / transifex (push) Waiting to run

This commit is contained in:
PabloMK7 2026-05-07 01:36:21 +02:00 committed by GitHub
parent b081f800a4
commit 5ddbaeae23
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
23 changed files with 592 additions and 41 deletions

View file

@ -48,6 +48,7 @@ foreach(KEY IN ITEMS
"texture_filter"
"texture_sampling"
"delay_game_render_thread_us"
"simulate_3ds_gpu_timings"
"layout_option"
"swap_screen"
"upright_screen"

View file

@ -45,6 +45,7 @@ object SettingKeys {
external fun texture_filter(): String
external fun texture_sampling(): String
external fun delay_game_render_thread_us(): String
external fun simulate_3ds_gpu_timings(): String
external fun layout_option(): String
external fun swap_screen(): String
external fun upright_screen(): String

View file

@ -56,7 +56,8 @@ enum class BooleanSetting(
COMPRESS_INSTALLED_CIA_CONTENT(SettingKeys.compress_cia_installs(), Settings.SECTION_STORAGE, false),
ANDROID_HIDE_IMAGES(SettingKeys.android_hide_images(), Settings.SECTION_MISC, false),
APPLY_REGION_FREE_PATCH(SettingKeys.apply_region_free_patch(), Settings.SECTION_SYSTEM, true),
USE_INTEGER_SCALING(SettingKeys.use_integer_scaling(), Settings.SECTION_RENDERER, false);
USE_INTEGER_SCALING(SettingKeys.use_integer_scaling(), Settings.SECTION_RENDERER, false),
SIMULATE_3DS_GPU_TIMINGS(SettingKeys.simulate_3ds_gpu_timings(), Settings.SECTION_RENDERER, true);
override var boolean: Boolean = defaultValue

View file

@ -1790,6 +1790,15 @@ class SettingsFragmentPresenter(private val fragmentView: SettingsFragmentView)
BooleanSetting.VSYNC.defaultValue
)
)
add(
SwitchSetting(
BooleanSetting.SIMULATE_3DS_GPU_TIMINGS,
R.string.simulate_3ds_gpu_timings,
R.string.simulate_3ds_gpu_timings_description,
BooleanSetting.SIMULATE_3DS_GPU_TIMINGS.key,
BooleanSetting.SIMULATE_3DS_GPU_TIMINGS.defaultValue
)
)
add(
SwitchSetting(
BooleanSetting.DEBUG_RENDERER,

View file

@ -179,6 +179,7 @@ void Config::ReadValues() {
ReadSetting("Renderer", Settings::values.bg_blue);
ReadSetting("Renderer", Settings::values.custom_second_layer_opacity);
ReadSetting("Renderer", Settings::values.delay_game_render_thread_us);
ReadSetting("Renderer", Settings::values.simulate_3ds_gpu_timings);
ReadSetting("Renderer", Settings::values.disable_right_eye_render);
ReadSetting("Renderer", Settings::values.swap_eyes_3d);
ReadSetting("Renderer", Settings::values.render_3d_which_display);

View file

@ -196,6 +196,10 @@ static const char* android_config_default_file_content = (BOOST_HANA_STRING(R"(
# Set to 0 for no delay, only useful in dynamic-fps games to simulate GPU delay.
)") DECLARE_KEY(delay_game_render_thread_us) BOOST_HANA_STRING(R"(
# Delays GPU completion events based on measurements taken from real hardware
# 0: No delay, 1 (default): Enable delay
)") DECLARE_KEY(simulate_3ds_gpu_timings) BOOST_HANA_STRING(R"(
# Disables rendering the right eye image
# Greatly improves performance in some games, but can cause flickering in others.
# 0 : Enable right eye rendering, 1: Disable right eye rendering

View file

@ -264,6 +264,8 @@
<string name="texture_filter_description">Enhances the visuals of applications by applying a filter to textures. The supported filters are Anime4K Ultrafast, Bicubic, ScaleForce, xBRZ freescale, and MMPX.</string>
<string name="delay_render_thread">Delay Game Render Thread</string>
<string name="delay_render_thread_description">Delay the game render thread when it submits data to the GPU. Helps with performance issues in the (very few) applications with dynamic framerates.</string>
<string name="simulate_3ds_gpu_timings">Simulate 3DS GPU Timings</string>
<string name="simulate_3ds_gpu_timings_description">Delays GPU completion events based on measurements taken from real hardware, so that games have more realistic GPU time measurements. Helps stabilize dynamic FPS games. Disabling this feature may improve performance in some rare cases at the cost of stability.</string>
<string name="advanced">Advanced</string>
<string name="texture_sampling_name">Texture Sampling</string>
<string name="texture_sampling_description">Overrides the sampling filter used by games. This can be useful in certain cases with poorly behaved games when upscaling. If unsure, set this to Game Controlled.</string>

View file

@ -722,6 +722,8 @@ void QtConfig::ReadRendererValues() {
ReadGlobalSetting(Settings::values.delay_game_render_thread_us);
ReadGlobalSetting(Settings::values.disable_right_eye_render);
ReadGlobalSetting(Settings::values.simulate_3ds_gpu_timings);
if (global) {
ReadBasicSetting(Settings::values.use_shader_jit);
}
@ -1266,6 +1268,8 @@ void QtConfig::SaveRendererValues() {
WriteGlobalSetting(Settings::values.delay_game_render_thread_us);
WriteGlobalSetting(Settings::values.disable_right_eye_render);
WriteGlobalSetting(Settings::values.simulate_3ds_gpu_timings);
if (global) {
WriteSetting(Settings::QKeys::use_shader_jit, Settings::values.use_shader_jit.GetValue(),
true);

View file

@ -154,6 +154,7 @@ void ConfigureGraphics::SetConfiguration() {
ui->toggle_display_refresh_rate_detection->setChecked(
Settings::values.use_display_refresh_rate_detection.GetValue());
}
ui->simulate_3ds_gpu_timings->setChecked(Settings::values.simulate_3ds_gpu_timings.GetValue());
}
void ConfigureGraphics::ApplyConfiguration() {
@ -182,6 +183,9 @@ void ConfigureGraphics::ApplyConfiguration() {
ConfigurationShared::ApplyPerGameSetting(
&Settings::values.delay_game_render_thread_us, ui->delay_render_combo,
[this](s32) { return ui->delay_render_slider->value(); });
ConfigurationShared::ApplyPerGameSetting(&Settings::values.simulate_3ds_gpu_timings,
ui->simulate_3ds_gpu_timings,
simulate_3ds_gpu_timings);
if (Settings::IsConfiguringGlobal()) {
Settings::values.use_shader_jit = ui->toggle_shader_jit->isChecked();
@ -212,6 +216,8 @@ void ConfigureGraphics::SetupPerGameUI() {
ui->physical_device_combo->setEnabled(Settings::values.physical_device.UsingGlobal());
ui->delay_render_combo->setEnabled(
Settings::values.delay_game_render_thread_us.UsingGlobal());
ui->simulate_3ds_gpu_timings->setEnabled(
Settings::values.simulate_3ds_gpu_timings.UsingGlobal());
return;
}
@ -254,6 +260,9 @@ void ConfigureGraphics::SetupPerGameUI() {
ConfigurationShared::SetColoredTristate(ui->disable_spirv_optimizer,
Settings::values.disable_spirv_optimizer,
disable_spirv_optimizer);
ConfigurationShared::SetColoredTristate(ui->simulate_3ds_gpu_timings,
Settings::values.simulate_3ds_gpu_timings,
simulate_3ds_gpu_timings);
}
void ConfigureGraphics::SetPhysicalDeviceComboVisibility(int index) {

View file

@ -44,6 +44,7 @@ private:
ConfigurationShared::CheckState async_presentation;
ConfigurationShared::CheckState spirv_shader_gen;
ConfigurationShared::CheckState disable_spirv_optimizer;
ConfigurationShared::CheckState simulate_3ds_gpu_timings;
std::unique_ptr<Ui::ConfigureGraphics> ui;
QColor bg_color;
};

View file

@ -372,7 +372,7 @@
<number>0</number>
</property>
<property name="maximum">
<number>16000</number>
<number>65000</number>
</property>
<property name="singleStep">
<number>100</number>
@ -404,6 +404,16 @@
</layout>
</widget>
</item>
<item>
<widget class="QCheckBox" name="simulate_3ds_gpu_timings">
<property name="toolTip">
<string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;Delays GPU completion events based on measurements taken from real hardware, so that games have more realistic GPU time measurements. Helps stabilize dynamic FPS games. Disabling this feature may improve performance in some rare cases at the cost of stability.&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string>
</property>
<property name="text">
<string>Simulate 3DS GPU timings</string>
</property>
</widget>
</item>
</layout>
</widget>
</item>

View file

@ -198,5 +198,18 @@ HackManager hack_manager = {
0x00040000001D1A00, // EUR
},
}},
{HackType::DELAY_TEXTURE_COPY_COMPLETION,
HackEntry{
.mode = HackAllowMode::FORCE,
.affected_title_ids =
{
// Super Mario 3D Land
0x0004000000054100, // JPN
0x0004000000054000, // USA
0x0004000000053F00, // EUR
0x0004000000089E00, // CHN
0x0004000000089D00, // KOR
},
}},
}};
}

View file

@ -16,6 +16,7 @@ enum class HackType : int {
REGION_FROM_SECURE,
REQUIRES_SHADER_FIXUP,
SPOOF_FRIEND_CODE_SEED,
DELAY_TEXTURE_COPY_COMPLETION,
};
class UserHackData {};

View file

@ -106,6 +106,7 @@ void LogSettings() {
log_setting("Renderer_TextureSampling",
GetTextureSamplingName(values.texture_sampling.GetValue()));
log_setting("Renderer_DelayGameRenderThreasUs", values.delay_game_render_thread_us.GetValue());
log_setting("Renderer_Simulate3DSGPUTimings", values.simulate_3ds_gpu_timings.GetValue());
log_setting("Renderer_DisableRightEyeRender", values.disable_right_eye_render.GetValue());
log_setting("Stereoscopy_Render3d", values.render_3d.GetValue());
log_setting("Stereoscopy_Factor3d", values.factor_3d.GetValue());
@ -215,6 +216,7 @@ void RestoreGlobalState(bool is_powered_on) {
values.texture_filter.SetGlobal(true);
values.texture_sampling.SetGlobal(true);
values.delay_game_render_thread_us.SetGlobal(true);
values.simulate_3ds_gpu_timings.SetGlobal(true);
values.layout_option.SetGlobal(true);
values.portrait_layout_option.SetGlobal(true);
values.secondary_display_layout.SetGlobal(true);

View file

@ -540,8 +540,9 @@ struct Values {
SwitchableSetting<TextureFilter> texture_filter{TextureFilter::NoFilter, Keys::texture_filter};
SwitchableSetting<TextureSampling> texture_sampling{TextureSampling::GameControlled,
Keys::texture_sampling};
SwitchableSetting<u16, true> delay_game_render_thread_us{0, 0, 16000,
SwitchableSetting<u16, true> delay_game_render_thread_us{0, 0, 65000,
Keys::delay_game_render_thread_us};
SwitchableSetting<bool> simulate_3ds_gpu_timings{true, Keys::simulate_3ds_gpu_timings};
SwitchableSetting<LayoutOption> layout_option{LayoutOption::Default, Keys::layout_option};
SwitchableSetting<bool> swap_screen{false, Keys::swap_screen};

View file

@ -581,8 +581,9 @@ System::ResultStatus System::Init(Frontend::EmuWindow& emu_window,
auto gsp = service_manager->GetService<Service::GSP::GSP_GPU>("gsp::Gpu");
gpu = std::make_unique<VideoCore::GPU>(*this, emu_window, secondary_window);
gpu->SetInterruptHandler(
[gsp](Service::GSP::InterruptId interrupt_id) { gsp->SignalInterrupt(interrupt_id); });
gpu->SetInterruptHandler([gsp](Service::GSP::InterruptId interrupt_id, u64 wait_delay_ns) {
gsp->SignalInterrupt(interrupt_id, wait_delay_ns);
});
auto plg_ldr = Service::PLGLDR::GetService(*this);
if (plg_ldr) {
@ -902,8 +903,9 @@ void System::serialize(Archive& ar, const unsigned int file_version) {
// Re-register gpu callback, because gsp service changed after service_manager got
// serialized
auto gsp = service_manager->GetService<Service::GSP::GSP_GPU>("gsp::Gpu");
gpu->SetInterruptHandler(
[gsp](Service::GSP::InterruptId interrupt_id) { gsp->SignalInterrupt(interrupt_id); });
gpu->SetInterruptHandler([gsp](Service::GSP::InterruptId interrupt_id, u64 wait_delay_ns) {
gsp->SignalInterrupt(interrupt_id, wait_delay_ns);
});
// Apply per program settings and switch the shader cache to the title running when the
// savestate was created.

View file

@ -12,6 +12,7 @@
#include "common/hacks/hack_manager.h"
#include "common/settings.h"
#include "core/core.h"
#include "core/core_timing.h"
#include "core/hle/ipc_helpers.h"
#include "core/hle/kernel/shared_memory.h"
#include "core/hle/kernel/shared_page.h"
@ -295,6 +296,23 @@ void GSP_GPU::SetAxiConfigQoSMode(Kernel::HLERequestContext& ctx) {
LOG_DEBUG(Service_GSP, "(STUBBED) called mode=0x{:08X}", mode);
}
// GSP_GPU::SetPerfLogMode service function.
// Reads one u32 from the request; any non-zero value enables the performance
// recorder, zero disables it. Replies with a single result-code word.
void GSP_GPU::SetPerfLogMode(Kernel::HLERequestContext& ctx) {
    IPC::RequestParser rp(ctx);
    const u32 requested_mode = rp.Pop<u32>();

    // Enabling the recorder also clears previously collected entries (see
    // PerformanceRecorder::SetEnabled).
    perf_recorder.SetEnabled(requested_mode != 0);

    IPC::RequestBuilder rb = rp.MakeBuilder(1, 0);
    rb.Push(ResultSuccess);
}
// GSP_GPU::GetPerfLog service function.
// Replies with the result code followed by the raw contents of the performance
// recorder (one {delta_time, sum_time} pair per interrupt id).
void GSP_GPU::GetPerfLog(Kernel::HLERequestContext& ctx) {
    IPC::RequestParser rp(ctx);

    // 15 normal words: 1 result code + 14 words of performance entries
    // (InterruptId::COUNT entries of two u32 each).
    IPC::RequestBuilder rb = rp.MakeBuilder(15, 0);
    // The result code must come first; without it only 14 of the 15 declared
    // words are written and the entries land where the caller expects the result.
    rb.Push(ResultSuccess);
    rb.PushRaw(perf_recorder.GetResults());
}
void GSP_GPU::RegisterInterruptRelayQueue(Kernel::HLERequestContext& ctx) {
IPC::RequestParser rp(ctx);
u32 flags = rp.Pop<u32>();
@ -337,7 +355,124 @@ void GSP_GPU::UnregisterInterruptRelayQueue(Kernel::HLERequestContext& ctx) {
LOG_DEBUG(Service_GSP, "called");
}
void GSP_GPU::SignalInterruptForThread(InterruptId interrupt_id, u32 thread_id) {
// Uncomment the following line to display the average delay calculated for every frame.
// #define SHOW_AVERAGE_TIME_PER_FRAME
// Delivers an interrupt to one GSP thread, either right away or after an emulated delay.
//
// @param interrupt_id  Interrupt being signalled.
// @param thread_id     GSP thread that will receive the interrupt.
// @param wait_delay_ns Delay, in nanoseconds, supplied by the caller for the operation that
//                      raised the interrupt. It is adjusted below depending on the
//                      simulate_3ds_gpu_timings setting and the texture-copy hack flag.
//
// PDC0/PDC1 are always delivered immediately. Every other interrupt is delivered immediately
// when the final delay is zero; otherwise it is parked in pending_interrupts and a timing event
// is scheduled that delivers it later through ProcessPendingInterrupt.
void GSP_GPU::SignalInterruptForThread(InterruptId interrupt_id, u32 thread_id, u64 wait_delay_ns) {
// Every gsp request takes a constant amount of time to be
// processed and control returned to the application. This
// time is estimated below.
static constexpr u64 sync_delay_nanoseconds = 300 * 1000;
// For a reason not yet understood, Super Mario 3D Land hangs on a white screen after the title
// screen when any of the save slots have the completion star icons. This is in some way related
// to the timings of texture copy commands, and gets fixed if we increase the amount of time
// those take. This issue may be resolved as timings become more accurate in the future.
static constexpr u64 sync_delay_nanoseconds_delayed_texcopy = 1000 * 1000;
#ifdef SHOW_AVERAGE_TIME_PER_FRAME
// Debug helper: accumulates the delay of every non-vsync interrupt and logs the average of the
// accumulated samples whenever at least 250 ms of host time have passed since the last log.
// The accumulators are function-local statics, so they are shared by all calls.
auto track_average = [&](bool is_vsync) {
using clock = std::chrono::steady_clock;
static uint64_t total_ns = 0;
static uint64_t sample_count = 0;
static auto last_print = clock::now();
if (!is_vsync) {
total_ns += wait_delay_ns;
++sample_count;
}
auto now = clock::now();
if (now - last_print >= std::chrono::milliseconds(250)) {
double average_ns =
(sample_count > 0)
? (static_cast<double>(total_ns) / static_cast<double>(sample_count))
: 0;
LOG_INFO(Service_GSP, "Average delay milliseconds per frame: {}",
average_ns / 1000000.f);
total_ns = 0;
sample_count = 0;
last_print = now;
}
};
#endif
// Signal VBlank interrupt immediately, this interrupt is signaled from
// a scheduler event so it already has the proper timing.
if (interrupt_id == InterruptId::PDC0 || interrupt_id == InterruptId::PDC1) {
#ifdef SHOW_AVERAGE_TIME_PER_FRAME
track_average(true);
#endif
if (perf_recorder.IsEnabled()) {
// VBlank carries no caller-supplied delay, so the recorder is fed the nominal duration
// of one frame derived from FRAME_TICKS and the ARM11 base clock rate.
constexpr u64 nanoseconds_per_frame = static_cast<u64>(
((static_cast<double>(VideoCore::FRAME_TICKS) / BASE_CLOCK_RATE_ARM11) * 1e9));
perf_recorder.UpdateTime(interrupt_id, nanoseconds_per_frame);
}
ProcessPendingInterruptImpl(interrupt_id, thread_id);
return;
}
// The recorder sees the caller-supplied delay, before the fixed overhead below is applied.
if (perf_recorder.IsEnabled()) {
perf_recorder.UpdateTime(interrupt_id, wait_delay_ns);
}
if (Settings::values.simulate_3ds_gpu_timings.GetValue()) {
// Timing simulation enabled: add the fixed request overhead on top of the supplied delay.
// PPF uses the longer overhead when the texture-copy hack is active.
if (delay_texture_copy_completion) {
wait_delay_ns += (interrupt_id == InterruptId::PPF)
? sync_delay_nanoseconds_delayed_texcopy
: sync_delay_nanoseconds;
} else {
wait_delay_ns += sync_delay_nanoseconds;
}
} else {
// Timing simulation disabled: drop the delay entirely, except for PPF when the
// texture-copy hack is active.
// NOTE(review): in that exception the long overhead is added to the caller-supplied
// delay rather than replacing it — confirm this is intended with simulation disabled.
if (delay_texture_copy_completion && interrupt_id == InterruptId::PPF) {
wait_delay_ns += sync_delay_nanoseconds_delayed_texcopy;
} else {
wait_delay_ns = 0;
}
}
#ifdef SHOW_AVERAGE_TIME_PER_FRAME
track_average(false);
#endif
if (wait_delay_ns) {
// Park the interrupt; the returned slot index is handed to the timing event as its
// user data so the callback can retrieve the entry later.
size_t pending_interrupt_id =
pending_interrupts.Push(std::make_pair(interrupt_id, thread_id));
if (pending_interrupt_id == std::numeric_limits<size_t>::max()) {
// No slot available: deliver immediately instead of dropping the interrupt.
LOG_ERROR(Service_GSP, "Pending interrupts queue is full");
ProcessPendingInterruptImpl(interrupt_id, thread_id);
} else {
system.Kernel().timing.ScheduleEvent(nsToCycles(wait_delay_ns),
SignalInterruptEventType,
static_cast<uintptr_t>(pending_interrupt_id));
}
} else {
ProcessPendingInterruptImpl(interrupt_id, thread_id);
}
}
void Service::GSP::GSP_GPU::ProcessPendingInterrupt(size_t pending_interrupt_id) {
auto pending_interrupt = pending_interrupts.Pop(pending_interrupt_id);
if (!pending_interrupt.has_value()) {
return;
}
const auto& [interrupt_id, thread_id] = *pending_interrupt;
ProcessPendingInterruptImpl(interrupt_id, thread_id);
}
void Service::GSP::GSP_GPU::ProcessPendingInterruptImpl(InterruptId interrupt_id, u32 thread_id) {
SessionData* session_data = FindRegisteredThreadData(thread_id);
if (!session_data) {
return;
@ -349,32 +484,55 @@ void GSP_GPU::SignalInterruptForThread(InterruptId interrupt_id, u32 thread_id)
return;
}
const bool is_pdc = interrupt_id == InterruptId::PDC0 || interrupt_id == InterruptId::PDC1;
auto* interrupt_relay_queue = GetInterruptRelayQueue(thread_id);
u8 next = interrupt_relay_queue->index;
next += interrupt_relay_queue->number_interrupts;
next = next % 0x34; // 0x34 is the number of interrupt slots
interrupt_relay_queue->number_interrupts += 1;
auto queue_interrupt = [&]() {
if (interrupt_relay_queue->number_interrupts >= InterruptRelayQueue::max_slots) {
interrupt_relay_queue->error_code = InterruptRelayQueue::queue_full_error;
} else {
u8 next = interrupt_relay_queue->index;
next += interrupt_relay_queue->number_interrupts;
next %= InterruptRelayQueue::max_slots;
interrupt_relay_queue->slot[next] = interrupt_id;
interrupt_relay_queue->error_code = 0x0; // No error
interrupt_relay_queue->number_interrupts += 1;
interrupt_relay_queue->slot[next] = interrupt_id;
interrupt_event->Signal();
}
};
if (is_pdc) {
if (!interrupt_relay_queue->ignore_pdc.Value()) {
if (interrupt_relay_queue->number_interrupts >=
InterruptRelayQueue::stop_queuing_pdc_threeshold) {
if (interrupt_id == InterruptId::PDC0) {
interrupt_relay_queue->missed_PDC0++;
} else {
interrupt_relay_queue->missed_PDC1++;
}
} else {
queue_interrupt();
}
}
// Update framebuffer information if requested
const s32 screen_id = (interrupt_id == InterruptId::PDC0) ? 0 : 1;
// Update framebuffer information if requested
const s32 screen_id = (interrupt_id == InterruptId::PDC0) ? 0
: (interrupt_id == InterruptId::PDC1) ? 1
: -1;
if (screen_id != -1) {
auto* info = GetFrameBufferInfo(thread_id, screen_id);
if (info->is_dirty) {
system.GPU().SetBufferSwap(screen_id, info->framebuffer_info[info->index]);
info->is_dirty.Assign(false);
}
}
interrupt_event->Signal();
} else {
queue_interrupt();
}
}
void GSP_GPU::SignalInterrupt(InterruptId interrupt_id) {
void GSP_GPU::SignalInterrupt(InterruptId interrupt_id, u64 wait_delay_ns) {
if (nullptr == shared_memory) {
LOG_WARNING(Service_GSP, "cannot synchronize until GSP shared memory has been created!");
return;
@ -385,7 +543,7 @@ void GSP_GPU::SignalInterrupt(InterruptId interrupt_id) {
// right), but the PDC0/1 interrupts are signaled for every registered thread.
if (interrupt_id == InterruptId::PDC0 || interrupt_id == InterruptId::PDC1) {
for (u32 thread_id = 0; thread_id < MaxGSPThreads; ++thread_id) {
SignalInterruptForThread(interrupt_id, thread_id);
SignalInterruptForThread(interrupt_id, thread_id, wait_delay_ns);
}
return;
}
@ -395,7 +553,7 @@ void GSP_GPU::SignalInterrupt(InterruptId interrupt_id) {
return;
}
SignalInterruptForThread(interrupt_id, active_thread_id);
SignalInterruptForThread(interrupt_id, active_thread_id, wait_delay_ns);
}
void GSP_GPU::SetLcdForceBlack(Kernel::HLERequestContext& ctx) {
@ -692,6 +850,11 @@ Result GSP_GPU::AcquireGpuRight(const Kernel::HLERequestContext& ctx,
Common::Hacks::HackType::REQUIRES_SHADER_FIXUP, process->codeset->program_id,
Common::Hacks::HackAllowMode::DISALLOW) != Common::Hacks::HackAllowMode::DISALLOW;
delay_texture_copy_completion =
Common::Hacks::hack_manager.GetHackAllowMode(
Common::Hacks::HackType::DELAY_TEXTURE_COPY_COMPLETION, process->codeset->program_id,
Common::Hacks::HackAllowMode::DISALLOW) != Common::Hacks::HackAllowMode::DISALLOW;
auto& gpu = system.GPU();
gpu.ApplyPerProgramSettings(process->codeset->program_id);
gpu.GetRightEyeDisabler().SetEnabled(right_eye_disable_allow);
@ -818,6 +981,9 @@ void GSP_GPU::serialize(Archive& ar, const unsigned int) {
ar & first_initialization;
ar & used_thread_ids;
ar & saved_vram;
ar & delay_texture_copy_completion;
ar & pending_interrupts;
ar & perf_recorder;
}
SERIALIZE_IMPL(GSP_GPU)
@ -840,8 +1006,8 @@ GSP_GPU::GSP_GPU(Core::System& system) : ServiceFramework("gsp::Gpu", 4), system
{0x000E, nullptr, "SetTextureCopy"},
{0x000F, nullptr, "SetMemoryFill"},
{0x0010, &GSP_GPU::SetAxiConfigQoSMode, "SetAxiConfigQoSMode"},
{0x0011, nullptr, "SetPerfLogMode"},
{0x0012, nullptr, "GetPerfLog"},
{0x0011, &GSP_GPU::SetPerfLogMode, "SetPerfLogMode"},
{0x0012, &GSP_GPU::GetPerfLog, "GetPerfLog"},
{0x0013, &GSP_GPU::RegisterInterruptRelayQueue, "RegisterInterruptRelayQueue"},
{0x0014, &GSP_GPU::UnregisterInterruptRelayQueue, "UnregisterInterruptRelayQueue"},
{0x0015, &GSP_GPU::TryAcquireRight, "TryAcquireRight"},
@ -866,6 +1032,11 @@ GSP_GPU::GSP_GPU(Core::System& system) : ServiceFramework("gsp::Gpu", 4), system
Kernel::MemoryRegion::BASE, "GSP:SharedMemory")
.Unwrap();
SignalInterruptEventType = system.Kernel().timing.RegisterEvent(
"GSPPendingInterrupt", [this](uintptr_t arg, s64 cycle_late) {
ProcessPendingInterrupt(static_cast<size_t>(arg));
});
first_initialization = true;
};

View file

@ -4,8 +4,10 @@
#pragma once
#include <array>
#include <cstddef>
#include <memory>
#include <optional>
#include <string>
#include <boost/optional/optional.hpp>
#include <boost/serialization/export.hpp>
@ -104,7 +106,7 @@ public:
* Signals that the specified interrupt type has occurred to userland code
* @param interrupt_id ID of interrupt that is being signalled
*/
void SignalInterrupt(InterruptId interrupt_id);
void SignalInterrupt(InterruptId interrupt_id, u64 wait_delay_ns);
/**
* Retrieves the framebuffer info stored in the GSP shared memory for the
@ -143,7 +145,11 @@ private:
* @param interrupt_id ID of interrupt that is being signalled.
* @param thread_id GSP thread that will receive the interrupt.
*/
void SignalInterruptForThread(InterruptId interrupt_id, u32 thread_id);
void SignalInterruptForThread(InterruptId interrupt_id, u32 thread_id, u64 wait_delay_ns);
void ProcessPendingInterrupt(size_t pending_interrupt_id);
void ProcessPendingInterruptImpl(InterruptId interrupt_id, u32 thread_id);
/**
* GSP_GPU::WriteHWRegs service function
@ -240,6 +246,10 @@ private:
*/
void SetAxiConfigQoSMode(Kernel::HLERequestContext& ctx);
void SetPerfLogMode(Kernel::HLERequestContext& ctx);
void GetPerfLog(Kernel::HLERequestContext& ctx);
/**
* GSP_GPU::RegisterInterruptRelayQueue service function
* Inputs:
@ -405,6 +415,118 @@ private:
/// Thread ids currently in use by the sessions connected to the GSPGPU service.
std::array<bool, MaxGSPThreads> used_thread_ids{};
/// The current thread needs a longer emulated texture copy completion
bool delay_texture_copy_completion{};
class PendingInterruptArray {
public:
PendingInterruptArray() {
for (size_t i = 0; i < array_size; i++) {
elements[i].first = InterruptId::COUNT;
}
}
size_t Push(const std::pair<InterruptId, u32> elem) {
if (elements[head].first != InterruptId::COUNT) {
// If the head position is occupied, the queue is full
return std::numeric_limits<size_t>::max();
}
elements[head] = elem;
size_t index = head;
head = (head + 1) % array_size;
return index;
}
std::optional<std::pair<InterruptId, u32>> Pop(size_t at) {
if (at >= array_size || elements[at].first == InterruptId::COUNT) {
// Invalid index or already free
return std::nullopt;
}
std::pair<InterruptId, u32> value = elements[at];
elements[at].first = InterruptId::COUNT;
return value;
}
private:
static constexpr size_t array_size = 512;
size_t head = 0;
std::array<std::pair<InterruptId, u32>, array_size> elements;
template <class Archive>
void serialize(Archive& ar, const unsigned int) {
ar & elements;
ar & head;
}
friend class boost::serialization::access;
};
class PerformanceRecorder {
public:
struct PerformanceEntry {
u32 delta_time{};
u32 sum_time{};
template <class Archive>
void serialize(Archive& ar, const unsigned int) {
ar & delta_time;
ar & sum_time;
}
friend class boost::serialization::access;
};
using PerfArray = std::array<PerformanceEntry, static_cast<u8>(InterruptId::COUNT)>;
PerformanceRecorder() = default;
void Reset() {
entries.fill({});
}
bool IsEnabled() {
return enabled;
}
void SetEnabled(bool _enabled) {
enabled = _enabled;
if (enabled) {
Reset();
}
}
void UpdateTime(InterruptId id, u64 nanoseconds) {
// These counters may overflow, which is normal.
entries[static_cast<u8>(id)].delta_time = static_cast<u32>(nanoseconds);
entries[static_cast<u8>(id)].sum_time += static_cast<u32>(nanoseconds);
}
const PerfArray& GetResults() {
return entries;
}
private:
PerfArray entries{};
bool enabled{};
template <class Archive>
void serialize(Archive& ar, const unsigned int) {
ar & entries;
ar & enabled;
}
friend class boost::serialization::access;
};
// This array is only needed to keep track of delayed notifications and simulate the GPU
// taking some time to finish the work, it doesn't exist on real hardware.
PendingInterruptArray pending_interrupts;
PerformanceRecorder perf_recorder;
Core::TimingEventType* SignalInterruptEventType = nullptr;
friend class SessionData;
template <class Archive>

View file

@ -1,10 +1,11 @@
// Copyright 2023 Citra Emulator Project
// Copyright Citra Emulator Project / Azahar Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <functional>
#include "common/bit_field.h"
#include "common/common_types.h"
namespace Service::GSP {
@ -18,25 +19,35 @@ enum class InterruptId : u8 {
PPF = 0x04,
P3D = 0x05,
DMA = 0x06,
COUNT,
};
/// GSP thread interrupt relay queue
/// Layout of the per-thread interrupt queue; the static_assert that follows this struct pins
/// its size to 0x40 bytes.
struct InterruptRelayQueue {
// Number of entries in `slot`. The queuing code wraps its write position modulo this value
// and reports queue_full_error once number_interrupts reaches it.
static constexpr size_t max_slots = 0x34;
// Once number_interrupts reaches this value, PDC0/PDC1 interrupts are no longer queued and
// are counted in missed_PDC0 / missed_PDC1 instead.
static constexpr size_t stop_queuing_pdc_threeshold = 0x20;
// Value stored in error_code when an interrupt cannot be queued because the queue is full.
static constexpr u8 queue_full_error = 0x1;
// Index of last interrupt in the queue
// NOTE(review): the queuing code computes the next write slot as
// (index + number_interrupts) % max_slots, which suggests this is the position of the oldest
// pending entry rather than the last one — confirm.
u8 index;
// Number of interrupts remaining to be processed by the userland code
u8 number_interrupts;
// Error code - zero on success, otherwise an error has occurred
u8 error_code;
u8 padding1;
union {
// Raw configuration byte.
u8 config;
// Bit 0 of config: when set, PDC0/PDC1 interrupts are neither queued nor counted as missed.
BitField<0, 1, u8> ignore_pdc;
};
// Number of PDC0 interrupts skipped because the queue was at or above the PDC threshold.
u32 missed_PDC0;
// Number of PDC1 interrupts skipped because the queue was at or above the PDC threshold.
u32 missed_PDC1;
// NOTE(review): the next two lines declare `slot` twice; this looks like the pre-change and
// post-change lines of the diff shown together — confirm only one exists in the real file.
InterruptId slot[0x34]; ///< Interrupt ID slots
InterruptId slot[max_slots]; ///< Interrupt ID slots
};
static_assert(sizeof(InterruptRelayQueue) == 0x40, "InterruptRelayQueue struct has incorrect size");
using InterruptHandler = std::function<void(InterruptId)>;
using InterruptHandler = std::function<void(InterruptId, u64)>;
} // namespace Service::GSP

View file

@ -26,6 +26,63 @@ namespace VideoCore {
constexpr VAddr VADDR_LCD = 0x1ED02000;
constexpr VAddr VADDR_GPU = 0x1EF00000;
/// Static helper that estimates how long a memory copy takes, in nanoseconds, from its size,
/// the kind of memory on each side and the engine performing it. Not instantiable.
class DelayGenerator {
private:
    DelayGenerator() = default;

    // Average transfer speed based on measurements taken from real
    // hardware. 4 different modes have been taken into consideration:
    // RAM -> RAM, RAM -> VRAM, VRAM -> RAM and VRAM -> VRAM.
    // Furthermore, measurements are split into DMA transfers and tex
    // copies. For simplicity, we will assume fills are as fast as
    // texture copies.

    /// Converts a throughput in MiB per second into nanoseconds needed per byte.
    static constexpr double mibps_to_ns_per_byte(double mib_per_sec) {
        return 1'000'000'000.0 / (mib_per_sec * 1024.0 * 1024.0);
    }

    /// Throughput table in MiB/s. First index: 0 = DMA, 1 = texture copy.
    /// Second index: CopyMode value.
    static constexpr std::array<std::array<double, 4>, 2> speed_mibps = {
        {{
             190.0, // DMA RAMTORAM
             310.0, // DMA RAMTOVRAM
             380.0, // DMA VRAMTORAM
             380.0, // DMA VRAMTOVRAM
         },
         {
             450.0,  // TEX RAMTORAM
             3100.0, // TEX RAMTOVRAM
             5400.0, // TEX VRAMTORAM
             5400.0, // TEX VRAMTOVRAM
         }}};

public:
    /// Source/destination memory combination; the enumerator order matches the columns of
    /// speed_mibps.
    enum class CopyMode {
        RAMTORAM,
        RAMTOVRAM,
        VRAMTORAM,
        VRAMTOVRAM,
    };

    /// Selects the copy mode from whether the input and the output are located in VRAM.
    static CopyMode GetCopyMode(bool input_vram, bool output_vram) {
        if (input_vram) {
            return output_vram ? CopyMode::VRAMTOVRAM : CopyMode::VRAMTORAM;
        }
        return output_vram ? CopyMode::RAMTOVRAM : CopyMode::RAMTORAM;
    }

    /// Estimates the duration of a copy.
    /// @param mode       Source/destination memory combination.
    /// @param is_texture True to use the texture copy speeds, false to use the DMA speeds.
    /// @param size       Number of bytes transferred.
    /// @return Estimated duration in nanoseconds, truncated towards zero.
    static u64 CalculateDelayNanoseconds(CopyMode mode, bool is_texture, size_t size) {
        // Spell out the row selection instead of indexing the table with a bool.
        const size_t engine_row = is_texture ? 1 : 0;
        const double base_ns_per_byte =
            mibps_to_ns_per_byte(speed_mibps[engine_row][static_cast<size_t>(mode)]);
        return static_cast<u64>(size * base_ns_per_byte);
    }
};
MICROPROFILE_DEFINE(GPU_DisplayTransfer, "GPU", "DisplayTransfer", MP_RGB(100, 100, 255));
MICROPROFILE_DEFINE(GPU_CmdlistProcessing, "GPU", "Cmdlist Processing", MP_RGB(100, 255, 100));
@ -102,7 +159,17 @@ void GPU::Execute(const Service::GSP::Command& command) {
const auto process = impl->system.Kernel().GetCurrentProcess();
impl->memory.CopyBlock(*process, command.dma_request.dest_address,
command.dma_request.source_address, command.dma_request.size);
impl->signal_interrupt(Service::GSP::InterruptId::DMA);
auto is_vram = [&](u32 addr) {
return addr >= Memory::VRAM_VADDR && addr <= Memory::VRAM_VADDR_END;
};
u64 delay = DelayGenerator::CalculateDelayNanoseconds(
DelayGenerator::GetCopyMode(is_vram(command.dma_request.source_address),
is_vram(command.dma_request.dest_address)),
false, command.dma_request.size);
impl->signal_interrupt(Service::GSP::InterruptId::DMA, delay);
break;
}
case CommandId::SubmitCmdList: {
@ -361,13 +428,18 @@ void GPU::MemoryFill(u32 index, u32 intr_index) {
impl->sw_blitter->MemoryFill(config);
}
// Treat fill as texture transfer from VRAM
u64 delay = DelayGenerator::CalculateDelayNanoseconds(
DelayGenerator::GetCopyMode(true, config.IsVRAM()), true,
config.GetEndAddress() - config.GetStartAddress());
// It seems that it won't signal interrupt if "address_start" is zero.
// TODO: hwtest this
if (config.GetStartAddress() != 0) {
if (intr_index == 0) {
impl->signal_interrupt(Service::GSP::InterruptId::PSC0);
impl->signal_interrupt(Service::GSP::InterruptId::PSC0, delay);
} else if (intr_index == 1) {
impl->signal_interrupt(Service::GSP::InterruptId::PSC1);
impl->signal_interrupt(Service::GSP::InterruptId::PSC1, delay);
}
}
@ -391,11 +463,15 @@ void GPU::MemoryTransfer() {
impl->debug_context->OnEvent(Pica::DebugContext::Event::IncomingDisplayTransfer, nullptr);
}
u64 delay{};
// Perform memory transfer
if (config.is_texture_copy) {
if (!impl->rasterizer->AccelerateTextureCopy(config)) {
impl->sw_blitter->TextureCopy(config);
}
delay = DelayGenerator::CalculateDelayNanoseconds(
DelayGenerator::GetCopyMode(config.IsInputVRAM(), config.IsOutputVRAM()), true,
config.texture_copy.size);
} else {
if (right_eye_disabler->ShouldAllowDisplayTransfer(config.GetPhysicalInputAddress(),
config.input_height)) {
@ -403,11 +479,14 @@ void GPU::MemoryTransfer() {
impl->sw_blitter->DisplayTransfer(config);
}
}
delay = DelayGenerator::CalculateDelayNanoseconds(
DelayGenerator::GetCopyMode(config.IsInputVRAM(), config.IsOutputVRAM()), true,
config.input_width * config.input_height * BytesPerPixel(config.input_format));
}
// Complete transfer.
config.trigger.Assign(0);
impl->signal_interrupt(Service::GSP::InterruptId::PPF);
impl->signal_interrupt(Service::GSP::InterruptId::PPF, delay);
}
void GPU::VBlankCallback(std::uintptr_t user_data, s64 cycles_late) {
@ -415,8 +494,8 @@ void GPU::VBlankCallback(std::uintptr_t user_data, s64 cycles_late) {
impl->renderer->SwapBuffers();
// Signal to GSP that GPU interrupt has occurred
impl->signal_interrupt(Service::GSP::InterruptId::PDC0);
impl->signal_interrupt(Service::GSP::InterruptId::PDC1);
impl->signal_interrupt(Service::GSP::InterruptId::PDC0, 0);
impl->signal_interrupt(Service::GSP::InterruptId::PDC1, 0);
// Reschedule recurrent event
impl->timing.ScheduleEvent(FRAME_TICKS - cycles_late, impl->vblank_event);

View file

@ -98,7 +98,7 @@ void PicaCore::SetInterruptHandler(Service::GSP::InterruptHandler& signal_interr
void PicaCore::ProcessCmdList(PAddr list, u32 size, bool ignore_list) {
if (ignore_list) {
signal_interrupt(Service::GSP::InterruptId::P3D);
signal_interrupt(Service::GSP::InterruptId::P3D, delay_generator.CalculateAndResetDelay());
return;
}
// Initialize command list tracking.
@ -148,6 +148,8 @@ void PicaCore::WriteInternalReg(u32 id, u32 value, u32 mask, bool& stop_requeste
return;
}
delay_generator.AddCommands(1);
// Expand a 4-bit mask to 4-byte mask, e.g. 0b0101 -> 0x00FF00FF
constexpr std::array<u32, 16> ExpandBitsToBytes = {
0x00000000, 0x000000ff, 0x0000ff00, 0x0000ffff, 0x00ff0000, 0x00ff00ff,
@ -174,7 +176,8 @@ void PicaCore::WriteInternalReg(u32 id, u32 value, u32 mask, bool& stop_requeste
// TODO(PabloMK7): This logic is not fully accurate, but close enough:
// https://problemkaputt.de/gbatek-3ds-gpu-internal-registers-finalize-interrupt-registers.htm
if (any_byte_match(regs.internal.reg_array[id], regs.internal.irq_compare)) [[likely]] {
signal_interrupt(Service::GSP::InterruptId::P3D);
signal_interrupt(Service::GSP::InterruptId::P3D,
delay_generator.CalculateAndResetDelay());
if (regs.internal.irq_autostop) [[likely]] {
stop_requested = true;
}
@ -551,6 +554,10 @@ void PicaCore::DrawArrays(bool is_indexed) {
return accelerate_draw;
}();
// Add vertices to the delay generator.
delay_generator.AddVertices(regs.internal.pipeline.num_vertices,
regs.internal.pipeline.triangle_topology);
// Attempt to use hardware vertex shaders if possible.
if (accelerate_draw && rasterizer->AccelerateDrawBatch(is_indexed)) {
return;

View file

@ -29,6 +29,86 @@ namespace Pica {
class DebugContext;
class ShaderEngine;
/**
 * Accumulates a rough measure of the GPU work submitted since the last query
 * (number of commands and number of triangles) and converts it into an
 * estimated processing delay in nanoseconds.
 */
class DelayGenerator {
public:
    /// Records `commands` additional GPU commands.
    void AddCommands(size_t commands) {
        command_count += commands;
    }

    /// Converts a vertex count into a triangle count for the given topology
    /// and adds it to the running total.
    void AddVertices(size_t vertices, PipelineRegs::TriangleTopology topology) {
        const bool shares_vertices = topology == PipelineRegs::TriangleTopology::Strip ||
                                     topology == PipelineRegs::TriangleTopology::Fan;
        if (!shares_vertices) {
            // Geometry shaders emit more vertices than they are given, but that
            // is not considered relevant for timing emulation; every other
            // topology is counted as an independent triangle list.
            triangle_count += vertices / 3;
            return;
        }

        // Strips and fans produce one triangle per vertex after the first two.
        // Batches with fewer than three vertices are still counted as one.
        triangle_count += (vertices < 3) ? 1 : (vertices - 2);
    }

    /// Returns the estimated delay, in nanoseconds, for the work accumulated so
    /// far and clears both counters.
    u64 CalculateAndResetDelay() {
        const float command_time = command_count * nanoseconds_per_command;
        const float drawn_time =
            (1.f - culled_triangle_threshold) * triangle_count * nanoseconds_per_triangle;
        const float culled_time = culled_triangle_threshold * triangle_count *
                                  (nanoseconds_per_triangle * culled_triangle_time_cost);

        command_count = 0;
        triangle_count = 0;

        float total = command_time;
        total += drawn_time;
        total += culled_time;
        return static_cast<u64>(total);
    }

private:
    // GPU timings depend on a large number of factors: triangle counts,
    // texture sizes and formats, shader complexity, cache behaviour, memory
    // layout, and so on. There is currently neither enough information nor
    // enough hardware emulation in place to produce a proper estimate.
    //
    // The complexity of a scene is therefore approximated from the amount of
    // geometry drawn and the number of GPU commands; everything else is
    // ignored for now.

    // With Mario Kart 7 as the reference, the console is understood to handle
    // around 20k triangles per frame on average. The game uses standard GPU
    // features, so it is treated as a representative case, and the average
    // already accounts for typical shader complexity. To avoid hurting
    // emulator performance the GPU is assumed to be twice as fast as that;
    // after all, timing accuracy is only wanted to fix bugs at this point.
    static constexpr float nanoseconds_per_triangle = 800.f / 2;

    // Many submitted triangles end up culled. The exact share depends on the
    // scene, so 35% is assumed. Culled triangles skip most of the pipeline and
    // are assumed to take about 20% of the time of a drawn triangle.
    static constexpr float culled_triangle_threshold = 0.35f; // 35%
    static constexpr float culled_triangle_time_cost = 0.20f; // 20%

    // Each command is assumed to take around 6 cycles @ 268MHz. There are no
    // real measurements behind this figure. TODO: Measure on real HW.
    static constexpr float nanoseconds_per_command = 22.4f;

    size_t triangle_count{};
    size_t command_count{};

    friend class boost::serialization::access;

    /// Serializes both counters; the order must stay triangle_count first,
    /// then command_count.
    template <class Archive>
    void serialize(Archive& ar, const u32 file_version) {
        ar & triangle_count;
        ar & command_count;
    }
};
class PicaCore {
public:
explicit PicaCore(Memory::MemorySystem& memory, std::shared_ptr<DebugContext> debug_context_);
@ -277,6 +357,8 @@ public:
AttributeBuffer input_default_attributes{};
ImmediateModeState immediate{};
DelayGenerator delay_generator{};
private:
friend class boost::serialization::access;
template <class Archive>
@ -291,6 +373,7 @@ private:
ar & fog;
ar & input_default_attributes;
ar & immediate;
ar & delay_generator;
ar & geometry_pipeline;
ar & primitive_assembler;
ar & cmd_list;

View file

@ -1,4 +1,4 @@
// Copyright 2023 Citra Emulator Project
// Copyright Citra Emulator Project / Azahar Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
@ -10,6 +10,7 @@
#include "common/assert.h"
#include "common/bit_field.h"
#include "common/common_funcs.h"
#include "core/memory.h"
namespace Pica {
@ -85,6 +86,11 @@ struct MemoryFillConfig {
return DecodeAddressRegister(address_end);
}
bool IsVRAM() const {
u32 addr = GetStartAddress();
return !(addr >= Memory::FCRAM_PADDR && addr < Memory::FCRAM_PADDR_END);
}
inline std::string DebugName() const {
return fmt::format("from {:#X} to {:#X} with {}-bit value {:#X}", GetStartAddress(),
GetEndAddress(), fill_32bit ? "32" : (fill_24bit ? "24" : "16"),
@ -155,6 +161,16 @@ struct DisplayTransferConfig {
input_width.Value(), output_width.Value());
}
bool IsInputVRAM() {
u32 addr = GetPhysicalInputAddress();
return !(addr >= Memory::FCRAM_PADDR && addr < Memory::FCRAM_PADDR_END);
}
bool IsOutputVRAM() {
u32 addr = GetPhysicalOutputAddress();
return !(addr >= Memory::FCRAM_PADDR && addr < Memory::FCRAM_PADDR_END);
}
union {
u32 output_size;