shader_jit: Emit LG2/EX2 subroutines on-demand (#2046)
Some checks are pending
citra-build / source (push) Waiting to run
citra-build / linux-x86_64 (appimage) (push) Waiting to run
citra-build / linux-x86_64 (appimage-wayland) (push) Waiting to run
citra-build / linux-x86_64 (gcc-nopch) (push) Waiting to run
citra-build / linux-arm64 (clang) (push) Waiting to run
citra-build / linux-arm64 (gcc-nopch) (push) Waiting to run
citra-build / macos (push) Waiting to run
citra-build / windows (msvc) (push) Waiting to run
citra-build / windows (msys2) (push) Waiting to run
citra-build / android (googleplay) (push) Waiting to run
citra-build / android (vanilla) (push) Waiting to run
citra-build / docker (push) Waiting to run
citra-format / clang-format (push) Waiting to run
citra-libretro / android (push) Waiting to run
citra-libretro / linux (push) Waiting to run
citra-libretro / windows (push) Waiting to run
citra-libretro / macos (arm64) (push) Waiting to run
citra-libretro / macos (x86_64) (push) Waiting to run
citra-libretro / ios (push) Waiting to run
citra-libretro / tvos (push) Waiting to run
citra-transifex / transifex (push) Waiting to run

Rather than emitting these subroutine functions for _every_ shader, only emit
the subroutines when the `LG2` and `EX2` instructions are actually used.
This saves a good chunk of memory across all shaders.

Inspired by Tanuki3DS.
This commit is contained in:
Wunk 2026-04-24 11:34:46 -07:00 committed by GitHub
parent 37b6c91de6
commit 91128d6625
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 40 additions and 43 deletions

View file

@ -508,6 +508,7 @@ void JitShader::Compile_DPH(Instruction instr) {
void JitShader::Compile_EX2(Instruction instr) { void JitShader::Compile_EX2(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
STR(X30, SP, POST_INDEXED, -16); STR(X30, SP, POST_INDEXED, -16);
exp2_used = true;
BL(exp2_subroutine); BL(exp2_subroutine);
LDR(X30, SP, PRE_INDEXED, 16); LDR(X30, SP, PRE_INDEXED, 16);
Compile_DestEnable(instr, SRC1); Compile_DestEnable(instr, SRC1);
@ -516,6 +517,7 @@ void JitShader::Compile_EX2(Instruction instr) {
void JitShader::Compile_LG2(Instruction instr) { void JitShader::Compile_LG2(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
STR(X30, SP, POST_INDEXED, -16); STR(X30, SP, POST_INDEXED, -16);
log2_used = true;
BL(log2_subroutine); BL(log2_subroutine);
LDR(X30, SP, PRE_INDEXED, 16); LDR(X30, SP, PRE_INDEXED, 16);
Compile_DestEnable(instr, SRC1); Compile_DestEnable(instr, SRC1);
@ -994,6 +996,14 @@ void JitShader::Compile(const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_
// Compile entire program // Compile entire program
Compile_Block(static_cast<u32>(program_code->size())); Compile_Block(static_cast<u32>(program_code->size()));
// Compile utility functions
if (log2_used) {
Compile_Log2(log2_subroutine);
}
if (exp2_used) {
Compile_Exp2(exp2_subroutine);
}
// Free memory that's no longer needed // Free memory that's no longer needed
program_code = nullptr; program_code = nullptr;
swizzle_data = nullptr; swizzle_data = nullptr;
@ -1021,18 +1031,9 @@ void JitShader::Compile(const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_
code_vec.shrink_to_fit(); code_vec.shrink_to_fit();
} }
JitShader::JitShader() : oaknut::VectorCodeGenerator(code_vec) { JitShader::JitShader() : oaknut::VectorCodeGenerator(code_vec) {}
CompilePrelude();
}
void JitShader::CompilePrelude() {
log2_subroutine = CompilePrelude_Log2();
exp2_subroutine = CompilePrelude_Exp2();
}
Label JitShader::CompilePrelude_Log2() {
Label subroutine;
void JitShader::Compile_Log2(Label subroutine) {
// We perform this approximation by first performing a range reduction into the range // We perform this approximation by first performing a range reduction into the range
// [1.0, 2.0). A minimax polynomial which was fit for the function log2(x) / (x - 1) is then // [1.0, 2.0). A minimax polynomial which was fit for the function log2(x) / (x - 1) is then
// evaluated. We multiply the result by (x - 1) then restore the result into the appropriate // evaluated. We multiply the result by (x - 1) then restore the result into the appropriate
@ -1136,13 +1137,9 @@ Label JitShader::CompilePrelude_Log2() {
DUP(SRC1.S4(), SRC1.Selem()[0]); DUP(SRC1.S4(), SRC1.Selem()[0]);
RET(); RET();
return subroutine;
} }
Label JitShader::CompilePrelude_Exp2() { void JitShader::Compile_Exp2(Label subroutine) {
Label subroutine;
// This approximation first performs a range reduction into the range [-0.5, 0.5). A minimax // This approximation first performs a range reduction into the range [-0.5, 0.5). A minimax
// polynomial which was fit for the function exp2(x) is then evaluated. We then restore the // polynomial which was fit for the function exp2(x) is then evaluated. We then restore the
// result into the appropriate range. // result into the appropriate range.
@ -1241,8 +1238,6 @@ Label JitShader::CompilePrelude_Exp2() {
DUP(SRC1.S4(), SRC1.Selem()[0]); DUP(SRC1.S4(), SRC1.Selem()[0]);
RET(); RET();
return subroutine;
} }
} // namespace Pica::Shader } // namespace Pica::Shader

View file

@ -1,4 +1,4 @@
// Copyright 2023 Citra Emulator Project // Copyright Citra Emulator Project / Azahar Emulator Project
// Licensed under GPLv2 or any later version // Licensed under GPLv2 or any later version
// Refer to the license.txt file included. // Refer to the license.txt file included.
@ -123,9 +123,8 @@ private:
/** /**
* Emits data and code for utility functions. * Emits data and code for utility functions.
*/ */
void CompilePrelude(); void Compile_Log2(oaknut::Label subroutine);
oaknut::Label CompilePrelude_Log2(); void Compile_Exp2(oaknut::Label subroutine);
oaknut::Label CompilePrelude_Exp2();
const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_code = nullptr; const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_code = nullptr;
const std::array<u32, MAX_SWIZZLE_DATA_LENGTH>* swizzle_data = nullptr; const std::array<u32, MAX_SWIZZLE_DATA_LENGTH>* swizzle_data = nullptr;
@ -146,6 +145,10 @@ private:
using CompiledShader = void(const void* setup, void* state, const std::byte* start_addr); using CompiledShader = void(const void* setup, void* state, const std::byte* start_addr);
CompiledShader* program = nullptr; CompiledShader* program = nullptr;
/// Library functions, emitted as used
bool log2_used : 1 = false;
bool exp2_used : 1 = false;
oaknut::Label log2_subroutine; oaknut::Label log2_subroutine;
oaknut::Label exp2_subroutine; oaknut::Label exp2_subroutine;
}; };

View file

@ -511,12 +511,14 @@ void JitShader::Compile_DPH(Instruction instr) {
void JitShader::Compile_EX2(Instruction instr) { void JitShader::Compile_EX2(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
exp2_used = true;
call(exp2_subroutine); call(exp2_subroutine);
Compile_DestEnable(instr, SRC1); Compile_DestEnable(instr, SRC1);
} }
void JitShader::Compile_LG2(Instruction instr) { void JitShader::Compile_LG2(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
log2_used = true;
call(log2_subroutine); call(log2_subroutine);
Compile_DestEnable(instr, SRC1); Compile_DestEnable(instr, SRC1);
} }
@ -1038,6 +1040,14 @@ void JitShader::Compile(const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_
// Compile entire program // Compile entire program
Compile_Block(static_cast<u32>(program_code->size())); Compile_Block(static_cast<u32>(program_code->size()));
// Compile utility functions
if (log2_used) {
Compile_Log2(log2_subroutine);
}
if (exp2_used) {
Compile_Exp2(exp2_subroutine);
}
// Free memory that's no longer needed // Free memory that's no longer needed
program_code = nullptr; program_code = nullptr;
swizzle_data = nullptr; swizzle_data = nullptr;
@ -1050,18 +1060,9 @@ void JitShader::Compile(const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_
LOG_DEBUG(HW_GPU, "Compiled shader size={}", getSize()); LOG_DEBUG(HW_GPU, "Compiled shader size={}", getSize());
} }
JitShader::JitShader() : Xbyak::CodeGenerator(MAX_SHADER_SIZE) { JitShader::JitShader() : Xbyak::CodeGenerator(MAX_SHADER_SIZE) {}
CompilePrelude();
}
void JitShader::CompilePrelude() {
log2_subroutine = CompilePrelude_Log2();
exp2_subroutine = CompilePrelude_Exp2();
}
Xbyak::Label JitShader::CompilePrelude_Log2() {
Xbyak::Label subroutine;
void JitShader::Compile_Log2(Xbyak::Label subroutine) {
// SSE does not have a log instruction, thus we must approximate. // SSE does not have a log instruction, thus we must approximate.
// We perform this approximation by first performing a range reduction into the range [1.0, 2.0). // We perform this approximation by first performing a range reduction into the range [1.0, 2.0).
// A minimax polynomial which was fit for the function log2(x) / (x - 1) is then evaluated. // A minimax polynomial which was fit for the function log2(x) / (x - 1) is then evaluated.
@ -1163,12 +1164,9 @@ Xbyak::Label JitShader::CompilePrelude_Log2() {
shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0));
ret(); ret();
return subroutine;
} }
Xbyak::Label JitShader::CompilePrelude_Exp2() { void JitShader::Compile_Exp2(Xbyak::Label subroutine) {
Xbyak::Label subroutine;
// SSE does not have an exp instruction, thus we must approximate. // SSE does not have an exp instruction, thus we must approximate.
// We perform this approximation by first performing a range reduction into the range [-0.5, 0.5). // We perform this approximation by first performing a range reduction into the range [-0.5, 0.5).
@ -1271,8 +1269,6 @@ Xbyak::Label JitShader::CompilePrelude_Exp2() {
shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0));
ret(); ret();
return subroutine;
} }
} // namespace Pica::Shader } // namespace Pica::Shader

View file

@ -1,4 +1,4 @@
// Copyright 2015 Citra Emulator Project // Copyright Citra Emulator Project / Azahar Emulator Project
// Licensed under GPLv2 or any later version // Licensed under GPLv2 or any later version
// Refer to the license.txt file included. // Refer to the license.txt file included.
@ -115,9 +115,8 @@ private:
/** /**
* Emits data and code for utility functions. * Emits data and code for utility functions.
*/ */
void CompilePrelude(); void Compile_Log2(Xbyak::Label subroutine);
Xbyak::Label CompilePrelude_Log2(); void Compile_Exp2(Xbyak::Label subroutine);
Xbyak::Label CompilePrelude_Exp2();
const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_code = nullptr; const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_code = nullptr;
const std::array<u32, MAX_SWIZZLE_DATA_LENGTH>* swizzle_data = nullptr; const std::array<u32, MAX_SWIZZLE_DATA_LENGTH>* swizzle_data = nullptr;
@ -138,6 +137,10 @@ private:
using CompiledShader = void(const void* setup, void* state, const u8* start_addr); using CompiledShader = void(const void* setup, void* state, const u8* start_addr);
CompiledShader* program = nullptr; CompiledShader* program = nullptr;
/// Library functions, emitted as used
bool log2_used : 1 = false;
bool exp2_used : 1 = false;
Xbyak::Label log2_subroutine; Xbyak::Label log2_subroutine;
Xbyak::Label exp2_subroutine; Xbyak::Label exp2_subroutine;
}; };