From a0e7d4bf50726529e045ebbff846be6946a4e59d Mon Sep 17 00:00:00 2001
From: xbzk
Date: Fri, 29 May 2026 20:48:56 -0300
Subject: [PATCH] [ffmpeg,android] proper detection of HW codec + vic fixes +
 ffmpeg debug callback stuff

---
Review note: some added lines below were revised after this patch was
generated, so hunk line counts and the diffstat are stale; apply with
`git apply --recount`. Header names on context `#include` lines were lost
before review and are left blank rather than guessed.

 src/video_core/host1x/codecs/decoder.cpp |  71 ++---
 src/video_core/host1x/ffmpeg.cpp         | 355 ++++++++++++++++++++++-
 src/video_core/host1x/ffmpeg.h           |  28 +-
 src/video_core/host1x/nvdec.cpp          |   1 +
 src/video_core/host1x/vic.cpp            |   9 +-
 5 files changed, 410 insertions(+), 54 deletions(-)

diff --git a/src/video_core/host1x/codecs/decoder.cpp b/src/video_core/host1x/codecs/decoder.cpp
index c75059db6f..c14d9bd3be 100644
--- a/src/video_core/host1x/codecs/decoder.cpp
+++ b/src/video_core/host1x/codecs/decoder.cpp
@@ -26,52 +26,43 @@ void Decoder::Decode() {
     }
 
     const auto packet_data = ComposeFrame();
-    // Send assembled bitstream to decoder.
-    if (!decode_api.SendPacket(packet_data)) {
-        return;
-    }
-    // Only receive/store visible frames.
-    if (vp9_hidden_frame) {
-        return;
-    }
-
-    // Receive output frames from decoder.
-    auto frame = decode_api.ReceiveFrame();
-
-    if (!frame) {
-        return;
-    }
-
-    if (IsInterlaced()) {
-        auto [luma_top, luma_bottom, chroma_top, chroma_bottom] = GetInterlacedOffsets();
-        auto frame_copy = frame;
-
-        if (!frame.get()) {
-            LOG_ERROR(HW_GPU,
-                      "Nvdec {} failed to decode interlaced frame for top {:#X} bottom 0x{:X}", id,
-                      luma_top, luma_bottom);
-        }
-
-        if (UsingDecodeOrder()) {
-            host1x.frame_queue.PushDecodeOrder(id, luma_top, std::move(frame));
-            host1x.frame_queue.PushDecodeOrder(id, luma_bottom, std::move(frame_copy));
-        } else {
-            host1x.frame_queue.PushPresentOrder(id, luma_top, std::move(frame));
-            host1x.frame_queue.PushPresentOrder(id, luma_bottom, std::move(frame_copy));
-        }
+    // Capture the slot offsets for this submission before sending; async HW
+    // decoders may not produce the corresponding frame until several iterations
+    // later, by which point GetProgressiveOffsets() returns a different slot.
+    FFmpeg::FrameOffsets offsets{};
+    offsets.hidden = vp9_hidden_frame;
+    offsets.interlaced = IsInterlaced();
+    if (offsets.interlaced) {
+        std::tie(offsets.luma, offsets.luma_bottom, offsets.chroma, offsets.chroma_bottom) =
+            GetInterlacedOffsets();
     } else {
-        auto [luma_offset, chroma_offset] = GetProgressiveOffsets();
+        std::tie(offsets.luma, offsets.chroma) = GetProgressiveOffsets();
+    }
 
-        if (!frame.get()) {
-            LOG_ERROR(HW_GPU, "Nvdec {} failed to decode progressive frame for luma {:#X}", id,
-                      luma_offset);
-        }
+    if (!decode_api.SendPacket(packet_data, offsets)) {
+        return;
+    }
 
+    // Queue a decoded frame under the luma offset it was submitted with.
+    auto push = [&](u64 luma, std::shared_ptr<FFmpeg::Frame> frame) {
         if (UsingDecodeOrder()) {
-            host1x.frame_queue.PushDecodeOrder(id, luma_offset, std::move(frame));
+            host1x.frame_queue.PushDecodeOrder(id, luma, std::move(frame));
         } else {
-            host1x.frame_queue.PushPresentOrder(id, luma_offset, std::move(frame));
+            host1x.frame_queue.PushPresentOrder(id, luma, std::move(frame));
+        }
+    };
+
+    // Drain everything the decoder has ready; one submission may release zero or several frames.
+    while (auto result = decode_api.ReceiveFrame()) {
+        auto& [frame, o] = *result;
+        if (o.hidden || !frame) {
+            continue;
+        }
+        if (o.interlaced) {
+            // Both fields share the same decoded picture.
+            auto frame_copy = frame;
+            push(o.luma, std::move(frame));
+            push(o.luma_bottom, std::move(frame_copy));
+        } else {
+            push(o.luma, std::move(frame));
         }
     }
 }
diff --git a/src/video_core/host1x/ffmpeg.cpp b/src/video_core/host1x/ffmpeg.cpp
index 507e94f193..1e4c503755 100644
--- a/src/video_core/host1x/ffmpeg.cpp
+++ b/src/video_core/host1x/ffmpeg.cpp
@@ -4,6 +4,13 @@
 // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
+#include <algorithm>
+#include <cstdarg>
+#include <cstring>
+#include <mutex>
+#include <string_view>
+#include <vector>
+
 #include "common/assert.h"
 #include "common/logging.h"
 #include "common/scope_exit.h"
@@ -19,6 +26,7 @@ extern "C" {
 #endif
 
 #include
+#include <libavutil/log.h>
 }
 
 namespace FFmpeg {
@@ -83,6 +91,252 @@ std::string AVError(int errnum) {
     return errbuf;
 }
 
+// av_log callback: formats one libav* message and forwards it to the emulator log under HW_GPU,
+// mapping the FFmpeg severity onto LOG_ERROR / LOG_WARNING / LOG_INFO / LOG_DEBUG.
+void FFmpegLogCallback(void* avcl, int level, const char* fmt, va_list vl) {
+    // Callers may OR AV_LOG_C() tint bits into the upper byte of `level`. Keep only the severity,
+    // as FFmpeg's default callback does, so the filter and the mapping below see real levels.
+    if (level >= 0) {
+        level &= 0xff;
+    }
+    if (level > av_log_get_level()) {
+        return;
+    }
+    char line[1024];
+    int print_prefix = 1;
+    const int written = av_log_format_line2(avcl, level, fmt, vl, line, sizeof(line), &print_prefix);
+    if (written <= 0) {
+        return;
+    }
+    // `written` is the untruncated length, so clamp it to what actually fits in `line`.
+    std::string_view msg(line, std::min(static_cast<size_t>(written), sizeof(line) - 1));
+    while (!msg.empty() && (msg.back() == '\n' || msg.back() == '\r')) {
+        msg.remove_suffix(1);
+    }
+    if (msg.empty()) {
+        return;
+    }
+    // Surfaceless decode is the intended path; FFmpeg flags it as ERROR.
+    if (msg.find("Both surface and native_window are NULL") != std::string_view::npos) {
+        level = AV_LOG_INFO;
+    }
+    if (level <= AV_LOG_ERROR) {
+        LOG_ERROR(HW_GPU, "[ffmpeg] {}", msg);
+    } else if (level <= AV_LOG_WARNING) {
+        LOG_WARNING(HW_GPU, "[ffmpeg] {}", msg);
+    } else if (level <= AV_LOG_INFO) {
+        LOG_INFO(HW_GPU, "[ffmpeg] {}", msg);
+    } else {
+        LOG_DEBUG(HW_GPU, "[ffmpeg] {}", msg);
+    }
+}
+
+// Registers FFmpegLogCallback with libavutil exactly once per process.
+void InstallFFmpegLogCallbackOnce() {
+    static std::once_flag flag;
+    std::call_once(flag, [] {
+        av_log_set_callback(FFmpegLogCallback);
+    });
+}
+
+#ifdef ANDROID
+// Some MediaCodec impls (Qualcomm c2.qti) reject configure() with width=0,
+// so we parse width/height out of the SPS ourselves and set them on the
+// AVCodecContext before opening h264_mediacodec.
+// MSB-first bit reader over an H.264 RBSP. The constructor strips emulation-prevention bytes
+// (the 0x03 in every 00 00 03 sequence) so the Exp-Golomb readers see the raw payload.
+class H264BitReader {
+public:
+    explicit H264BitReader(std::span<const u8> data) {
+        m_rbsp.reserve(data.size());
+        for (size_t i = 0; i < data.size(); ++i) {
+            if (i + 2 < data.size() && data[i] == 0 && data[i + 1] == 0 && data[i + 2] == 0x03) {
+                m_rbsp.push_back(data[i]);
+                m_rbsp.push_back(data[i + 1]);
+                i += 2; // skip the emulation-prevention byte
+            } else {
+                m_rbsp.push_back(data[i]);
+            }
+        }
+    }
+
+    // Reads `n` bits (n <= 32). Bits past the end of the buffer read as 0 and latch Overrun().
+    u32 ReadBits(int n) {
+        u32 v = 0;
+        for (int i = 0; i < n; ++i) {
+            v <<= 1;
+            const size_t byte = m_bit_pos >> 3;
+            if (byte < m_rbsp.size()) {
+                v |= (m_rbsp[byte] >> (7 - (m_bit_pos & 7))) & 1u;
+            } else {
+                m_overrun = true;
+            }
+            ++m_bit_pos;
+        }
+        return v;
+    }
+
+    // Reads an unsigned Exp-Golomb code, ue(v).
+    u32 ReadUE() {
+        int zeros = 0;
+        while (zeros < 32 && ReadBits(1) == 0) {
+            ++zeros;
+        }
+        if (zeros >= 32) {
+            // No valid ue(v) has 32 leading zeros, and `1u << 32` would be undefined behaviour.
+            // Saturate so callers reject the value through their own range checks.
+            return 0xFFFFFFFFu;
+        }
+        return (1u << zeros) - 1u + ReadBits(zeros);
+    }
+
+    // Reads a signed Exp-Golomb code, se(v).
+    s32 ReadSE() {
+        const u32 v = ReadUE();
+        return (v & 1u) ? static_cast<s32>((v + 1u) / 2u) : -static_cast<s32>(v / 2u);
+    }
+
+    // True once any read has run past the end of the payload.
+    bool Overrun() const {
+        return m_overrun;
+    }
+
+private:
+    std::vector<u8> m_rbsp;
+    size_t m_bit_pos{};
+    bool m_overrun{};
+};
+
+// Parses the cropped picture size out of one SPS NAL unit (NAL header byte included, start code
+// excluded). Returns {width, height}, or nullopt when the SPS is truncated, malformed, or
+// describes a picture larger than 8192 in either dimension.
+std::optional<std::pair<int, int>> ParseH264SpsDimensions(std::span<const u8> sps_nalu) {
+    if (sps_nalu.size() < 4) {
+        return std::nullopt;
+    }
+    H264BitReader r(sps_nalu.subspan(1)); // skip the one-byte NAL header
+
+    const u32 profile_idc = r.ReadBits(8);
+    r.ReadBits(16); // constraint flags + level_idc
+    r.ReadUE();     // seq_parameter_set_id
+
+    // Profiles without the high-profile extension are implicitly 4:2:0.
+    u32 chroma_format_idc = 1;
+    bool separate_colour_plane = false;
+
+    switch (profile_idc) {
+    case 100: case 110: case 122: case 244: case 44: case 83:
+    case 86: case 118: case 128: case 138: case 139: case 134: case 135: {
+        chroma_format_idc = r.ReadUE();
+        if (chroma_format_idc > 3) {
+            return std::nullopt;
+        }
+        if (chroma_format_idc == 3) {
+            separate_colour_plane = r.ReadBits(1) != 0;
+        }
+        r.ReadUE();    // bit_depth_luma_minus8
+        r.ReadUE();    // bit_depth_chroma_minus8
+        r.ReadBits(1); // qpprime_y_zero_transform_bypass_flag
+        if (r.ReadBits(1)) { // seq_scaling_matrix_present_flag
+            const int n = chroma_format_idc == 3 ? 12 : 8;
+            for (int i = 0; i < n; ++i) {
+                if (r.ReadBits(1)) {
+                    // Skip one scaling list; the values are not needed, only their bit length.
+                    const int size = i < 6 ? 16 : 64;
+                    int last_scale = 8, next_scale = 8;
+                    for (int j = 0; j < size; ++j) {
+                        if (next_scale != 0) {
+                            next_scale = (last_scale + r.ReadSE() + 256) % 256;
+                        }
+                        last_scale = next_scale != 0 ? next_scale : last_scale;
+                    }
+                }
+            }
+        }
+        break;
+    }
+    default:
+        break;
+    }
+
+    r.ReadUE(); // log2_max_frame_num_minus4
+    const u32 pic_order_cnt_type = r.ReadUE();
+    if (pic_order_cnt_type == 0) {
+        r.ReadUE(); // log2_max_pic_order_cnt_lsb_minus4
+    } else if (pic_order_cnt_type == 1) {
+        r.ReadBits(1);
+        r.ReadSE();
+        r.ReadSE();
+        const u32 n = r.ReadUE(); // num_ref_frames_in_pic_order_cnt_cycle
+        if (n > 255) {
+            // The standard caps this at 255; a larger value is corrupt input and would
+            // otherwise drive an attacker-sized loop.
+            return std::nullopt;
+        }
+        for (u32 i = 0; i < n; ++i) {
+            r.ReadSE();
+        }
+    }
+    r.ReadUE();    // max_num_ref_frames
+    r.ReadBits(1); // gaps_in_frame_num_value_allowed_flag
+    const u32 pic_width_in_mbs_minus1 = r.ReadUE();
+    const u32 pic_height_in_map_units_minus1 = r.ReadUE();
+    const u32 frame_mbs_only_flag = r.ReadBits(1);
+    if (!frame_mbs_only_flag) {
+        r.ReadBits(1); // mb_adaptive_frame_field_flag
+    }
+    r.ReadBits(1); // direct_8x8_inference_flag
+
+    // 64-bit arithmetic so hostile field values cannot wrap into a plausible size.
+    const u32 field_factor = 2u - frame_mbs_only_flag;
+    const u64 coded_width = (u64{pic_width_in_mbs_minus1} + 1u) * 16u;
+    const u64 coded_height = u64{field_factor} * (u64{pic_height_in_map_units_minus1} + 1u) * 16u;
+
+    u64 crop_x = 0;
+    u64 crop_y = 0;
+    if (r.ReadBits(1)) { // frame_cropping_flag
+        const u32 left = r.ReadUE();
+        const u32 right = r.ReadUE();
+        const u32 top = r.ReadUE();
+        const u32 bottom = r.ReadUE();
+        // Crop offsets are in chroma-sample units: 2x2 for 4:2:0, 2x1 for 4:2:2, and 1x1 for
+        // 4:4:4, monochrome and separate colour planes.
+        const bool has_chroma = !separate_colour_plane && chroma_format_idc != 0;
+        const u32 sub_width_c = (has_chroma && chroma_format_idc != 3) ? 2u : 1u;
+        const u32 sub_height_c = (has_chroma && chroma_format_idc == 1) ? 2u : 1u;
+        crop_x = (u64{left} + right) * sub_width_c;
+        crop_y = (u64{top} + bottom) * sub_height_c * field_factor;
+    }
+
+    if (r.Overrun() || crop_x >= coded_width || crop_y >= coded_height) {
+        return std::nullopt;
+    }
+    const u64 width = coded_width - crop_x;
+    const u64 height = coded_height - crop_y;
+    if (width > 8192 || height > 8192) {
+        return std::nullopt;
+    }
+    return std::pair{static_cast<int>(width), static_cast<int>(height)};
+}
+
+// Match a 3- or 4-byte annex-B start code at `i`. Returns its length, or 0.
+size_t MatchStartCode(std::span<const u8> data, size_t i) {
+    const size_t n = data.size();
+    if (i + 3 < n && data[i] == 0 && data[i + 1] == 0 && data[i + 2] == 0 && data[i + 3] == 1) {
+        return 4;
+    }
+    if (i + 2 < n && data[i] == 0 && data[i + 1] == 0 && data[i + 2] == 1) {
+        return 3;
+    }
+    return 0;
+}
+
+// Pull SPS (NAL type 7) + PPS (NAL type 8) out of an annex-B frame into an
+// extradata buffer, each prefixed with a 4-byte start code. Eden synthesizes
+// these inline into the very first frame; h264_mediacodec wants them at open.
+std::vector<u8> ExtractH264AnnexBExtradata(std::span<const u8> packet) {
+    std::vector<u8> extradata;
+    const size_t size = packet.size();
+    size_t i = 0;
+    while (i < size) {
+        const size_t sc = MatchStartCode(packet, i);
+        if (sc == 0) {
+            ++i;
+            continue;
+        }
+        const size_t nal_start = i + sc;
+        if (nal_start >= size) {
+            break;
+        }
+        const u8 nal_type = packet[nal_start] & 0x1F;
+
+        // The NAL runs up to the next start code, or to the end of the packet.
+        size_t j = nal_start + 1;
+        while (j < size && MatchStartCode(packet, j) == 0) {
+            ++j;
+        }
+
+        if (nal_type == 7 || nal_type == 8) {
+            constexpr u8 start[4] = {0, 0, 0, 1};
+            extradata.insert(extradata.end(), std::begin(start), std::end(start));
+            extradata.insert(extradata.end(), packet.begin() + nal_start, packet.begin() + j);
+        } else if (nal_type == 1 || nal_type == 5) {
+            // First coded slice: parameter sets, if any, precede it.
+            break;
+        }
+        i = j;
+    }
+    return extradata;
+}
+
+// Finds the first SPS (NAL type 7) in an annex-B buffer and returns its picture dimensions,
+// or nullopt when there is no SPS or it fails to parse.
+std::optional<std::pair<int, int>> ParseFirstSpsInAnnexB(std::span<const u8> data) {
+    const size_t size = data.size();
+    size_t i = 0;
+    while (i < size) {
+        const size_t sc = MatchStartCode(data, i);
+        if (sc == 0) {
+            ++i;
+            continue;
+        }
+        const size_t nal_start = i + sc;
+        if (nal_start >= size) {
+            break;
+        }
+        if ((data[nal_start] & 0x1F) == 7) {
+            size_t j = nal_start + 1;
+            while (j < size && MatchStartCode(data, j) == 0) {
+                ++j;
+            }
+            return ParseH264SpsDimensions(data.subspan(nal_start, j - nal_start));
+        }
+        i = nal_start + 1;
+    }
+    return std::nullopt;
+}
+#endif
+
 }
 
 Packet::Packet(std::span<const u8> data) {
@@ -117,7 +371,26 @@ Decoder::Decoder(Tegra::Host1x::NvdecCommon::VideoCodec codec) {
             return AV_CODEC_ID_NONE;
         }
     }();
-    m_codec = avcodec_find_decoder(av_codec);
+
+#ifdef ANDROID
+    // FFmpeg exposes MediaCodec via dedicated decoders rather than as a
+    // hw_config on the regular ones.
+    if (Settings::values.nvdec_emulation.GetValue() == Settings::NvdecEmulation::Gpu) {
+        const char* mc_name = nullptr;
+        switch (av_codec) {
+        case AV_CODEC_ID_H264: mc_name = "h264_mediacodec"; break;
+        case AV_CODEC_ID_VP8: mc_name = "vp8_mediacodec"; break;
+        case AV_CODEC_ID_VP9: mc_name = "vp9_mediacodec"; break;
+        default: break;
+        }
+        if (mc_name) {
+            m_codec = avcodec_find_decoder_by_name(mc_name);
+        }
+    }
+#endif
+    if (!m_codec) {
+        m_codec = avcodec_find_decoder(av_codec);
+    }
 }
 
 bool Decoder::SupportsDecodingOnDevice(AVPixelFormat* out_pix_fmt, AVHWDeviceType type) const {
@@ -205,6 +478,9 @@ DecoderContext::DecoderContext(const Decoder& decoder) : m_decoder{decoder} {
     av_opt_set(m_codec_context->priv_data, "tune", "zerolatency", 0);
     m_codec_context->thread_count = 0;
     m_codec_context->thread_type &= ~FF_THREAD_FRAME;
+    // Forwarded into MediaCodec as KEY_LOW_LATENCY on Android.
+    m_codec_context->flags |= AV_CODEC_FLAG_LOW_DELAY;
+    m_codec_context->flags2 |= AV_CODEC_FLAG2_FAST;
 }
 
 DecoderContext::~DecoderContext() {
@@ -218,7 +494,19 @@ void DecoderContext::InitializeHardwareDecoder(const HardwareContext& context, A
     m_codec_context->pix_fmt = hw_pix_fmt;
 }
 
-bool DecoderContext::OpenContext(const Decoder& decoder) {
+bool DecoderContext::OpenContext(const Decoder& decoder, std::span<const u8> extradata) {
+    if (!extradata.empty()) {
+        av_freep(&m_codec_context->extradata);
+        m_codec_context->extradata = static_cast<u8*>(
+            av_mallocz(extradata.size() + AV_INPUT_BUFFER_PADDING_SIZE));
+        if (!m_codec_context->extradata) {
+            LOG_ERROR(HW_GPU, "Failed to allocate extradata");
+            return false;
+        }
+        std::memcpy(m_codec_context->extradata, extradata.data(), extradata.size());
+        m_codec_context->extradata_size = static_cast<int>(extradata.size());
+    }
+
     if (const int ret = avcodec_open2(m_codec_context, decoder.GetCodec(), nullptr); ret < 0) {
         LOG_ERROR(HW_GPU, "avcodec_open2 error: {}", AVError(ret));
         return false;
@@ -278,10 +566,16 @@ void DecodeApi::Reset() {
     m_hardware_context.reset();
     m_decoder_context.reset();
     m_decoder.reset();
+    m_opened = false;
+    m_needs_h264_extradata = false;
+    m_next_pts = 0;
+    while (!m_pending_offsets.empty()) {
+        m_pending_offsets.pop();
+    }
 }
 
 bool DecodeApi::Initialize(Tegra::Host1x::NvdecCommon::VideoCodec codec) {
-    av_log_set_level(AV_LOG_DEBUG);
+    InstallFFmpegLogCallbackOnce();
     this->Reset();
     m_decoder.emplace(codec);
 
@@ -293,23 +587,68 @@ bool DecodeApi::Initialize(Tegra::Host1x::NvdecCommon::VideoCodec codec) {
         m_hardware_context->InitializeForDecoder(*m_decoder_context, *m_decoder);
     }
 
-    // Open the decoder context.
+#ifdef ANDROID
+    // h264_mediacodec needs SPS/PPS in extradata at open. We pull them from
+    // the first frame's bitstream in SendPacket.
+    m_needs_h264_extradata = m_decoder->GetCodec() &&
+        std::string_view(m_decoder->GetCodec()->name) == "h264_mediacodec";
+    if (m_needs_h264_extradata) {
+        return true;
+    }
+#endif
+
     if (!m_decoder_context->OpenContext(*m_decoder)) {
         this->Reset();
         return false;
     }
+    m_opened = true;
 
     return true;
 }
 
-bool DecodeApi::SendPacket(std::span<const u8> packet_data) {
+// Submits one bitstream packet together with the slot offsets its decoded frame must be routed
+// to. Opens the codec lazily on the first packet when Initialize() deferred it.
+// Returns false when the codec cannot be opened or the decoder rejects the packet.
+bool DecodeApi::SendPacket(std::span<const u8> packet_data, const FrameOffsets& offsets) {
+    if (!m_opened) {
+        std::vector<u8> extradata;
+#ifdef ANDROID
+        if (m_needs_h264_extradata) {
+            extradata = ExtractH264AnnexBExtradata(packet_data);
+            if (extradata.empty()) {
+                // No SPS/PPS yet: drop the packet and retry the open on the next one.
+                return true;
+            }
+            if (auto dims = ParseFirstSpsInAnnexB(extradata)) {
+                auto* ctx = m_decoder_context->GetCodecContext();
+                ctx->width = dims->first;
+                ctx->height = dims->second;
+                ctx->coded_width = dims->first;
+                ctx->coded_height = dims->second;
+            }
+        }
+#endif
+        if (!m_decoder_context->OpenContext(*m_decoder, extradata)) {
+            this->Reset();
+            return false;
+        }
+        m_opened = true;
+    }
     FFmpeg::Packet packet(packet_data);
-    return m_decoder_context->SendPacket(packet);
+    packet.GetPacket()->pts = m_next_pts;
+    packet.GetPacket()->dts = m_next_pts;
+    if (!m_decoder_context->SendPacket(packet)) {
+        // A rejected packet yields no frame. Queueing its offsets would leave a stale entry
+        // and pair every later frame with the wrong slot.
+        return false;
+    }
+    m_pending_offsets.push(offsets);
+    ++m_next_pts;
+    return true;
 }
 
-std::shared_ptr<Frame> DecodeApi::ReceiveFrame() {
-    // Receive raw frame from decoder.
-    return m_decoder_context->ReceiveFrame();
+// Returns the next decoded frame paired with the oldest queued offsets, or nullopt when the
+// decoder has nothing ready.
+// NOTE(review): pairing is strictly FIFO; it assumes the decoder emits exactly one frame per
+// accepted packet, in submission order - confirm for VP9 hidden frames.
+std::optional<DecodeApi::DecodedFrame> DecodeApi::ReceiveFrame() {
+    if (!m_opened) {
+        // The codec is still waiting for extradata; there is nothing to poll.
+        return std::nullopt;
+    }
+    auto frame = m_decoder_context->ReceiveFrame();
+    if (!frame) {
+        return std::nullopt;
+    }
+    FrameOffsets offsets{};
+    if (!m_pending_offsets.empty()) {
+        offsets = m_pending_offsets.front();
+        m_pending_offsets.pop();
+    }
+    return DecodedFrame{std::move(frame), offsets};
 }
 
 }
diff --git a/src/video_core/host1x/ffmpeg.h b/src/video_core/host1x/ffmpeg.h
index fdb6908bb6..5ea0954951 100644
--- a/src/video_core/host1x/ffmpeg.h
+++ b/src/video_core/host1x/ffmpeg.h
@@ -8,9 +8,9 @@
 
 #include
 #include
+#include <queue>
 #include
 #include
-#include
 
 #include "common/common_funcs.h"
 #include "common/common_types.h"
@@ -179,7 +179,7 @@ public:
     ~DecoderContext();
 
     void InitializeHardwareDecoder(const HardwareContext& context, AVPixelFormat hw_pix_fmt);
-    bool OpenContext(const Decoder& decoder);
+    bool OpenContext(const Decoder& decoder, std::span<const u8> extradata = {});
     bool SendPacket(const Packet& packet);
     std::shared_ptr<Frame> ReceiveFrame();
 
@@ -198,6 +198,17 @@ private:
     bool m_decode_order{};
 };
 
+// Slot offsets passed alongside a packet so the eventual frame can be routed
+// back to the right DPB slot even when MediaCodec runs a few frames behind.
+struct FrameOffsets {
+    bool interlaced{}; // frame carries two fields; luma_bottom/chroma_bottom are meaningful
+    bool hidden{};     // frame was submitted as a hidden frame and must not be queued
+    u64 luma{};
+    u64 chroma{};
+    u64 luma_bottom{};
+    u64 chroma_bottom{};
+};
+
 class DecodeApi {
 public:
     YUZU_NON_COPYABLE(DecodeApi);
@@ -213,13 +224,22 @@ public:
         return m_decoder_context->UsingDecodeOrder();
     }
 
-    bool SendPacket(std::span<const u8> packet_data);
-    std::shared_ptr<Frame> ReceiveFrame();
+    bool SendPacket(std::span<const u8> packet_data, const FrameOffsets& offsets);
+
+    // A decoded frame together with the offsets captured when its packet was submitted.
+    struct DecodedFrame {
+        std::shared_ptr<Frame> frame;
+        FrameOffsets offsets;
+    };
+    std::optional<DecodedFrame> ReceiveFrame();
 
 private:
     std::optional<FFmpeg::Decoder> m_decoder;
     std::optional<FFmpeg::DecoderContext> m_decoder_context;
     std::optional<FFmpeg::HardwareContext> m_hardware_context;
+    bool m_opened{};               // codec context has been opened
+    bool m_needs_h264_extradata{}; // open is deferred until SPS/PPS are seen
+    s64 m_next_pts{};              // monotonically increasing pts stamped on each packet
+    std::queue<FrameOffsets> m_pending_offsets; // offsets of packets awaiting their frame
 };
 
 } // namespace FFmpeg
diff --git a/src/video_core/host1x/nvdec.cpp b/src/video_core/host1x/nvdec.cpp
index f2e5c358d8..5dbc6a417e 100644
--- a/src/video_core/host1x/nvdec.cpp
+++ b/src/video_core/host1x/nvdec.cpp
@@ -31,6 +31,7 @@ Nvdec::Nvdec(Host1x& host1x_, s32 id_, u32 syncpt)
 
 Nvdec::~Nvdec() {
     LOG_INFO(HW_GPU, "Destroying nvdec {}", id);
+    host1x.frame_queue.Close(id);
 }
 
 void Nvdec::ProcessMethod(u32 method, u32 argument) {
diff --git a/src/video_core/host1x/vic.cpp b/src/video_core/host1x/vic.cpp
index 906714cc16..4f636ae510 100644
--- a/src/video_core/host1x/vic.cpp
+++ b/src/video_core/host1x/vic.cpp
@@ -122,8 +122,13 @@ void Vic::Execute() noexcept {
     for (size_t i = 0; i < config.slot_structs.size(); i++) {
         if (auto& slot_config = config.slot_structs[i]; slot_config.config.slot_enable) {
             auto const luma_offset = regs.surfaces[i][SurfaceIndex::Current].luma.Address();
-            if (nvdec_id == -1)
+            if (nvdec_id == -1) {
                 nvdec_id = host1x.frame_queue.VicFindNvdecFdFromOffset(luma_offset);
+                if (nvdec_id != -1) {
+                    LOG_INFO(HW_GPU, "Vic {} resolved nvdec_id={} for slot {} luma {:#X}",
+                             id, nvdec_id, i, luma_offset);
+                }
+            }
             if (auto frame = host1x.frame_queue.GetFrame(nvdec_id, luma_offset); frame.get()) {
                 switch (frame->GetPixelFormat()) {
                 case AV_PIX_FMT_YUV420P:
@@ -137,7 +142,7 @@ void Vic::Execute() noexcept {
                     break;
                 }
                 Blend(config, slot_config, config.output_surface_config.out_pixel_format);
-            } else {
+            } else if (nvdec_id != -1) {
                 LOG_ERROR(HW_GPU, "Vic {} failed to get frame with offset {:#X}", id, luma_offset);
             }
         }