eden/src/video_core/host1x/vic.cpp

// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later

// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later

#include <array>
#include <tuple>
#include <stdint.h>

#if defined(ARCHITECTURE_x86_64)
#include <immintrin.h>
#endif

extern "C" {
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion"
#endif
#include <libswscale/swscale.h>
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif
}

#include "common/alignment.h"
#include "common/assert.h"
#include "common/bit_field.h"
#include "common/logging.h"
#include "common/polyfill_thread.h"
#include "common/settings.h"

#include "video_core/engines/maxwell_3d.h"
#include "video_core/guest_memory.h"
#include "video_core/host1x/host1x.h"
#include "video_core/host1x/nvdec.h"
#include "video_core/host1x/vic.h"
#include "video_core/memory_manager.h"
#include "video_core/textures/decoders.h"

#if defined(ARCHITECTURE_x86_64)
#include "common/cpu_features.h"
#endif

#if defined(ARCHITECTURE_x86_64) \
    && (defined(_MSC_VER) \
    || (defined(__GNUC__) && defined(__SSE4_1__)) \
    || (defined(__clang__) && defined(__SSE4_1__)))
#define COMPILED_HAS_SSE41 1
#else
#define COMPILED_HAS_SSE41 0
#endif

namespace Tegra::Host1x {
namespace {

static bool HasSSE41() {
#ifdef ARCHITECTURE_x86_64
    return Common::g_cpu_caps.sse4_1;
#else
    return false;
#endif
}

void SwizzleSurface(std::span<u8> output, u32 out_stride, std::span<const u8> input, u32 in_stride, u32 height) noexcept {
    //// Taken from https://github.com/averne/FFmpeg/blob/nvtegra/libavutil/hwcontext_nvtegra.c#L949
    /// Can only handle block height == 1.
    u32 const x_mask = 0xFFFFFFD2u, y_mask = 0x2Cu;
    u32 offs_x = 0, offs_y = 0;
    for (size_t y = 0; y < height; y += 2) {
        auto dst_line = output.data() + offs_y * 16;
        auto const src_line = input.data() + u32(y) * (in_stride / 16) * 16;

        auto offs_line = offs_x;
        for (u32 x = 0; x < in_stride; x += 16) {
            std::memcpy(&dst_line[offs_line * 16], &src_line[x], 16);
            std::memcpy(&dst_line[offs_line * 16 + 16], &src_line[x + in_stride], 16);
            offs_line = (offs_line - x_mask) & x_mask;
        }
        offs_y = (offs_y - y_mask) & y_mask;
        offs_x += offs_y ? 0 : out_stride; // Wrap into next tile row
    }
}

} // namespace

Vic::Vic(Host1x& host1x_, s32 id_, u32 syncpt) noexcept
    : CDmaPusher{host1x_, id_}
    , id{id_}
    , syncpoint{syncpt}
{
    LOG_INFO(HW_GPU, "Created vic {}", id);
}

Vic::~Vic() noexcept {
    LOG_INFO(HW_GPU, "Destroying vic {}", id);
    host1x.frame_queue.Close(id);
}

void Vic::ProcessMethod(u32 method, u32 arg) noexcept {
    LOG_TRACE(HW_GPU, "Vic {} method {:#X}", id, u32(method));
    regs.reg_array[method] = arg;
    switch (Method(method * sizeof(u32))) {
    case Method::Execute:
        Execute();
        break;
    default:
        break;
    }
}

void Vic::Execute() noexcept {
    ConfigStruct config{};
    host1x.gmmu_manager.ReadBlock(regs.config_struct_offset.Address(), &config, sizeof(ConfigStruct));

    auto output_width = config.output_surface_config.out_surface_width + 1;
    auto output_height = config.output_surface_config.out_surface_height + 1;
    output_surface.resize(output_width * output_height);

    if (Settings::values.nvdec_emulation.GetValue() != Settings::NvdecEmulation::Off) {
        for (size_t i = 0; i < config.slot_structs.size(); i++) {
            if (auto& slot_config = config.slot_structs[i]; slot_config.config.slot_enable) {
                auto const luma_offset = regs.surfaces[i][SurfaceIndex::Current].luma.Address();
                if (nvdec_id == -1)
                    nvdec_id = host1x.frame_queue.VicFindNvdecFdFromOffset(luma_offset);
                if (auto frame = host1x.frame_queue.GetFrame(nvdec_id, luma_offset); frame.get()) {
                    switch (frame->GetPixelFormat()) {
                    case AV_PIX_FMT_YUV420P:
                        ReadY8__V8U8_N420(slot_config, regs.surfaces[i], std::move(frame), true);
                        break;
                    case AV_PIX_FMT_NV12:
                        ReadY8__V8U8_N420(slot_config, regs.surfaces[i], std::move(frame), false);
                        break;
                    default:
                        UNIMPLEMENTED_MSG("Unimplemented slot pixel format {}", u32(slot_config.surface_config.slot_pixel_format.Value()));
                        break;
                    }
                    Blend(config, slot_config, config.output_surface_config.out_pixel_format);
                } else {
                    LOG_ERROR(HW_GPU, "Vic {} failed to get frame with offset {:#X}", id, luma_offset);
                }
            }
        }
    } else {
        // Fill the frame with black, as otherwise they can have random data and be very glitchy.
        std::fill(output_surface.begin(), output_surface.end(), Pixel{});
    }

    auto const format = config.output_surface_config.out_pixel_format;
    switch (format) {
    case VideoPixelFormat::A8B8G8R8:
    case VideoPixelFormat::X8B8G8R8:
    case VideoPixelFormat::A8R8G8B8:
        WriteABGR(config.output_surface_config, format);
        break;
    case VideoPixelFormat::Y8__V8U8_N420:
        WriteY8__V8U8_N420(config.output_surface_config);
        break;
    default:
        UNIMPLEMENTED_MSG("Unknown video pixel format {}", format.Value());
        break;
    }
}

void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, std::span<const PlaneOffsets> offsets, std::shared_ptr<const FFmpeg::Frame> frame, bool planar, bool interlaced) noexcept {
    auto const out_luma_width = slot.surface_config.slot_surface_width + 1;
    auto const out_luma_height = (slot.surface_config.slot_surface_height + 1) * (interlaced ? 2 : 1);
    auto const out_luma_stride = out_luma_width;

    auto const in_luma_width = (std::min)(frame->GetWidth(), s32(out_luma_width));
    auto const in_luma_height = (std::min)(frame->GetHeight(), s32(out_luma_height));
    auto const in_luma_stride = frame->GetStride(0);

    auto const in_chroma_stride{frame->GetStride(1)};

    auto const* luma_buffer{frame->GetPlane(0)};
    auto const* chroma_u_buffer{frame->GetPlane(1)};
    auto const* chroma_v_buffer{frame->GetPlane(2)};

    LOG_TRACE(HW_GPU,
        "Reading frame"
        "\ninput luma {}x{} stride {} chroma {}x{} stride {}\n"
        "output luma {}x{} stride {} chroma {}x{} stride {}",
        in_luma_width, in_luma_height, in_luma_stride, in_luma_width / 2, in_luma_height / 2,
        in_chroma_stride, out_luma_width, out_luma_height, out_luma_stride, out_luma_width,
        out_luma_height, out_luma_stride);

    slot_surface.resize_destructive(out_luma_width * out_luma_height);
    if (COMPILED_HAS_SSE41 && HasSSE41() && in_luma_width % 16 == 0) {
#if COMPILED_HAS_SSE41
        auto const alpha_linear = u16(slot.config.planar_alpha.Value());
        auto const alpha = _mm_slli_epi64(_mm_set1_epi64x(s64(slot.config.planar_alpha.Value())), 48);

        auto const shuffle_mask = _mm_set_epi8(13, 15, 14, 12, 9, 11, 10, 8, 5, 7, 6, 4, 1, 3, 2, 0);
        auto const sse_aligned_width = Common::AlignDown(in_luma_width, 16);

        for (s32 y = 0; y < in_luma_height; y++) {
            auto const src_luma{y * in_luma_stride};
            auto const src_chroma{(y / 2) * in_chroma_stride};
            auto const dst{y * out_luma_stride};
            s32 x = 0;
            for (; x < sse_aligned_width; x += 16) {
                // clang-format off
                // Prefetch next iteration's memory
                _mm_prefetch((const char*)&luma_buffer[src_luma + x + 16], _MM_HINT_T0);

                // Load 8 bytes * 2 of 8-bit luma samples
                // luma0 = 00 00 00 00 00 00 00 00 LL LL LL LL LL LL LL LL
                auto luma0 = _mm_loadl_epi64((__m128i*)&luma_buffer[src_luma + x + 0]);
                auto luma1 = _mm_loadl_epi64((__m128i*)&luma_buffer[src_luma + x + 8]);

                __m128i chroma;

                if (planar) {
                    _mm_prefetch((const char*)&chroma_u_buffer[src_chroma + x / 2 + 8], _MM_HINT_T0);
                    _mm_prefetch((const char*)&chroma_v_buffer[src_chroma + x / 2 + 8], _MM_HINT_T0);

                    // If Chroma is planar, we have separate U and V planes, load 8 bytes of each
                    // chroma_u0 = 00 00 00 00 00 00 00 00 UU UU UU UU UU UU UU UU
                    // chroma_v0 = 00 00 00 00 00 00 00 00 VV VV VV VV VV VV VV VV
                    auto chroma_u0 = _mm_loadl_epi64((__m128i*)&chroma_u_buffer[src_chroma + x / 2]);
                    auto chroma_v0 = _mm_loadl_epi64((__m128i*)&chroma_v_buffer[src_chroma + x / 2]);

                    // Interleave the 8 bytes of U and V into a single 16 byte reg
                    // chroma = VV UU VV UU VV UU VV UU VV UU VV UU VV UU VV UU
                    chroma = _mm_unpacklo_epi8(chroma_u0, chroma_v0);
                } else {
                    _mm_prefetch((const char*)&chroma_u_buffer[src_chroma + x / 2 + 8], _MM_HINT_T0);

                    // Chroma is already interleaved in semiplanar format, just load 16 bytes
                    // chroma = VV UU VV UU VV UU VV UU VV UU VV UU VV UU VV UU
                    chroma = _mm_load_si128((__m128i*)&chroma_u_buffer[src_chroma + x]);
                }

                // Convert the low 8 bytes of 8-bit luma into 16-bit luma
                // luma0 = [00] [00] [00] [00] [00] [00] [00] [00] [LL] [LL] [LL] [LL] [LL] [LL] [LL] [LL]
                // ->
                // luma0 = [00 LL] [00 LL] [00 LL] [00 LL] [00 LL] [00 LL] [00 LL] [00 LL]
                luma0 = _mm_cvtepu8_epi16(luma0);
                luma1 = _mm_cvtepu8_epi16(luma1);

                // Treat the 8 bytes of 8-bit chroma as 16-bit channels, this allows us to take both the
                // U and V together as one element. Using chroma twice here duplicates the values, as we
                // take element 0 from chroma, and then element 0 from chroma again, etc. We need to
                // duplicate chroma horitonally as chroma is half the width of luma.
                // chroma   = [VV8 UU8] [VV7 UU7] [VV6 UU6] [VV5 UU5] [VV4 UU4] [VV3 UU3] [VV2 UU2] [VV1 UU1]
                // ->
                // chroma00 = [VV4 UU4] [VV4 UU4] [VV3 UU3] [VV3 UU3] [VV2 UU2] [VV2 UU2] [VV1 UU1] [VV1 UU1]
                // chroma01 = [VV8 UU8] [VV8 UU8] [VV7 UU7] [VV7 UU7] [VV6 UU6] [VV6 UU6] [VV5 UU5] [VV5 UU5]
                auto chroma00 = _mm_unpacklo_epi16(chroma, chroma);
                auto chroma01 = _mm_unpackhi_epi16(chroma, chroma);

                // Interleave the 16-bit luma and chroma.
                // luma0    = [008 LL8] [007 LL7] [006 LL6] [005 LL5] [004 LL4] [003 LL3] [002 LL2] [001 LL1]
                // chroma00 = [VV8 UU8] [VV7 UU7] [VV6 UU6] [VV5 UU5] [VV4 UU4] [VV3 UU3] [VV2 UU2] [VV1 UU1]
                // ->
                // yuv0     = [VV4 UU4 004 LL4] [VV3 UU3 003 LL3] [VV2 UU2 002 LL2] [VV1 UU1 001 LL1]
                // yuv1     = [VV8 UU8 008 LL8] [VV7 UU7 007 LL7] [VV6 UU6 006 LL6] [VV5 UU5 005 LL5]
                auto yuv0 = _mm_unpacklo_epi16(luma0, chroma00);
                auto yuv1 = _mm_unpackhi_epi16(luma0, chroma00);
                auto yuv2 = _mm_unpacklo_epi16(luma1, chroma01);
                auto yuv3 = _mm_unpackhi_epi16(luma1, chroma01);

                // Shuffle the luma/chroma into the channel ordering we actually want. The high byte of
                // the luma which is now a constant 0 after converting 8-bit -> 16-bit is used as the
                // alpha. Luma -> R, U -> G, V -> B, 0 -> A
                // yuv0 = [VV4 UU4 004 LL4] [VV3 UU3 003 LL3] [VV2 UU2 002 LL2] [VV1 UU1 001 LL1]
                // ->
                // yuv0 = [AA4 VV4 UU4 LL4] [AA3 VV3 UU3 LL3] [AA2 VV2 UU2 LL2] [AA1 VV1 UU1 LL1]
                yuv0 = _mm_shuffle_epi8(yuv0, shuffle_mask);
                yuv1 = _mm_shuffle_epi8(yuv1, shuffle_mask);
                yuv2 = _mm_shuffle_epi8(yuv2, shuffle_mask);
                yuv3 = _mm_shuffle_epi8(yuv3, shuffle_mask);

                // Extend the 8-bit channels we have into 16-bits, as that's the target surface format.
                // Since this turns just the low 8 bytes into 16 bytes, the second of
                // each operation here right shifts the register by 8 to get the high pixels.
                // yuv0  = [AA4] [VV4] [UU4] [LL4] [AA3] [VV3] [UU3] [LL3] [AA2] [VV2] [UU2] [LL2] [AA1] [VV1] [UU1] [LL1]
                // ->
                // yuv01 = [002 AA2] [002 VV2] [002 UU2] [002 LL2] [001 AA1] [001 VV1] [001 UU1] [001 LL1]
                // yuv23 = [004 AA4] [004 VV4] [004 UU4] [004 LL4] [003 AA3] [003 VV3] ]003 UU3] [003 LL3]
                auto yuv01 = _mm_cvtepu8_epi16(yuv0);
                auto yuv23 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv0, 8));
                auto yuv45 = _mm_cvtepu8_epi16(yuv1);
                auto yuv67 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv1, 8));
                auto yuv89 = _mm_cvtepu8_epi16(yuv2);
                auto yuv1011 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv2, 8));
                auto yuv1213 = _mm_cvtepu8_epi16(yuv3);
                auto yuv1415 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv3, 8));

                // Left-shift all 16-bit channels by 2, this is to get us into a 10-bit format instead
                // of 8, which is the format alpha is in, as well as other blending values.
                yuv01 = _mm_slli_epi16(yuv01, 2);
                yuv23 = _mm_slli_epi16(yuv23, 2);
                yuv45 = _mm_slli_epi16(yuv45, 2);
                yuv67 = _mm_slli_epi16(yuv67, 2);
                yuv89 = _mm_slli_epi16(yuv89, 2);
                yuv1011 = _mm_slli_epi16(yuv1011, 2);
                yuv1213 = _mm_slli_epi16(yuv1213, 2);
                yuv1415 = _mm_slli_epi16(yuv1415, 2);

                // OR in the planar alpha, this has already been duplicated and shifted into position,
                // and just fills in the AA channels with the actual alpha value.
                yuv01 = _mm_or_si128(yuv01, alpha);
                yuv23 = _mm_or_si128(yuv23, alpha);
                yuv45 = _mm_or_si128(yuv45, alpha);
                yuv67 = _mm_or_si128(yuv67, alpha);
                yuv89 = _mm_or_si128(yuv89, alpha);
                yuv1011 = _mm_or_si128(yuv1011, alpha);
                yuv1213 = _mm_or_si128(yuv1213, alpha);
                yuv1415 = _mm_or_si128(yuv1415, alpha);

                // Store out the pixels. One pixel is now 8 bytes, so each store is 2 pixels.
                // [AA AA] [VV VV] [UU UU] [LL LL] [AA AA] [VV VV] [UU UU] [LL LL]
                _mm_store_si128((__m128i*)&slot_surface[dst + x + 0], yuv01);
                _mm_store_si128((__m128i*)&slot_surface[dst + x + 2], yuv23);
                _mm_store_si128((__m128i*)&slot_surface[dst + x + 4], yuv45);
                _mm_store_si128((__m128i*)&slot_surface[dst + x + 6], yuv67);
                _mm_store_si128((__m128i*)&slot_surface[dst + x + 8], yuv89);
                _mm_store_si128((__m128i*)&slot_surface[dst + x + 10], yuv1011);
                _mm_store_si128((__m128i*)&slot_surface[dst + x + 12], yuv1213);
                _mm_store_si128((__m128i*)&slot_surface[dst + x + 14], yuv1415);
            }
            for (; x < in_luma_width; x++) {
                slot_surface[dst + x].r = u16(luma_buffer[src_luma + x] << 2);
                // Chroma samples are duplicated horizontally and vertically.
                if (planar) {
                    slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + x / 2] << 2);
                    slot_surface[dst + x].b = u16(chroma_v_buffer[src_chroma + x / 2] << 2);
                } else {
                    slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2);
                    slot_surface[dst + x].b = u16(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2);
                }
                slot_surface[dst + x].a = alpha_linear;
            }
        }
#endif
    } else {
        auto const alpha = u16(slot.config.planar_alpha.Value());
        for (size_t y = 0; y < size_t(in_luma_height); y++) {
            auto const src_luma = y * in_luma_stride;
            auto const src_chroma = (y / 2) * in_chroma_stride;
            auto const dst = y * out_luma_stride;
            for (size_t x = 0; x < size_t(in_luma_width); x++) {
                slot_surface[dst + x].r = u16(luma_buffer[src_luma + x] << 2);
                // Chroma samples are duplicated horizontally and vertically.
                if (planar) {
                    slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + x / 2] << 2);
                    slot_surface[dst + x].b = u16(chroma_v_buffer[src_chroma + x / 2] << 2);
                } else {
                    slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2);
                    slot_surface[dst + x].b = u16(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2);
                }
                slot_surface[dst + x].a = alpha;
            }
        }
    }
}

void Vic::ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span<const PlaneOffsets> offsets, std::shared_ptr<const FFmpeg::Frame> frame, bool planar, bool top_field) noexcept {
    if(planar) {
        auto const out_luma_width{slot.surface_config.slot_surface_width + 1};
        auto const out_luma_height{(slot.surface_config.slot_surface_height + 1) * 2};
        auto const out_luma_stride{out_luma_width};

        slot_surface.resize(out_luma_width * out_luma_height);

        auto const in_luma_width = (std::min)(frame->GetWidth(), s32(out_luma_width));
        [[maybe_unused]] auto const in_luma_height = (std::min)(frame->GetHeight(), s32(out_luma_height));
        auto const in_luma_stride{frame->GetStride(0)};

        [[maybe_unused]] auto const in_chroma_width = (frame->GetWidth() + 1) / 2;
        auto const in_chroma_height = (frame->GetHeight() + 1) / 2;
        auto const in_chroma_stride{frame->GetStride(1)};

        auto const* luma_buffer{frame->GetPlane(0)};
        auto const* chroma_u_buffer{frame->GetPlane(1)};
        auto const* chroma_v_buffer{frame->GetPlane(2)};

        LOG_TRACE(HW_GPU, "Reading frame"
            "\ninput luma {}x{} stride {} chroma {}x{} stride {}\n"
            "output luma {}x{} stride {} chroma {}x{} stride {}",
            in_luma_width, in_luma_height, in_luma_stride, in_chroma_width, in_chroma_height,
            in_chroma_stride, out_luma_width, out_luma_height, out_luma_stride,
            out_luma_width / 2, out_luma_height / 2, out_luma_stride);

        auto DecodeBobField = [&]() {
            auto const alpha = u16(slot.config.planar_alpha.Value());
            for (s32 y = s32(top_field == false); y < in_chroma_height * 2; y += 2) {
                auto const src_luma{y * in_luma_stride};
                auto const src_chroma{(y / 2) * in_chroma_stride};
                auto const dst{y * out_luma_stride};
                for (s32 x = 0; x < in_luma_width; x++) {
                    slot_surface[dst + x].r = u16(luma_buffer[src_luma + x] << 2);
                    if(planar) {
                        slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + x / 2] << 2);
                        slot_surface[dst + x].b = u16(chroma_v_buffer[src_chroma + x / 2] << 2);
                    } else {
                        slot_surface[dst + x].g = u16(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2);
                        slot_surface[dst + x].b = u16(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2);
                    }
                    slot_surface[dst + x].a = alpha;
                }
                s32 other_line = (top_field ? y + 1 : y - 1) * out_luma_stride;
                std::memcpy(&slot_surface[other_line], &slot_surface[dst], out_luma_width * sizeof(Pixel));
            }
        };

        switch (slot.config.deinterlace_mode) {
        case DXVAHD_DEINTERLACE_MODE_PRIVATE::WEAVE:
            // Due to the fact that we do not write to memory in nvdec, we cannot use Weave as it
            // relies on the previous frame.
            DecodeBobField();
            break;
        case DXVAHD_DEINTERLACE_MODE_PRIVATE::BOB_FIELD:
            DecodeBobField();
            break;
        case DXVAHD_DEINTERLACE_MODE_PRIVATE::DISI1:
            // Due to the fact that we do not write to memory in nvdec, we cannot use DISI1 as it
            // relies on previous/next frames.
            DecodeBobField();
            break;
        default:
            LOG_ERROR(HW_GPU, "Deinterlace mode {} not implemented!", s32(slot.config.deinterlace_mode.Value()));
            break;
        }
    } else {
        ReadProgressiveY8__V8U8_N420(slot, offsets, std::move(frame), planar, true);
    }
}

void Vic::ReadY8__V8U8_N420(const SlotStruct& slot, std::span<const PlaneOffsets> offsets, std::shared_ptr<const FFmpeg::Frame> frame, bool planar) noexcept {
    switch (slot.config.frame_format) {
    case DXVAHD_FRAME_FORMAT::PROGRESSIVE:
        ReadProgressiveY8__V8U8_N420(slot, offsets, std::move(frame), planar, false);
        break;
    case DXVAHD_FRAME_FORMAT::TOP_FIELD:
        ReadInterlacedY8__V8U8_N420(slot, offsets, std::move(frame), planar, true);
        break;
    case DXVAHD_FRAME_FORMAT::BOTTOM_FIELD:
        ReadInterlacedY8__V8U8_N420(slot, offsets, std::move(frame), planar, false);
        break;
    default:
        LOG_ERROR(HW_GPU, "Unknown deinterlace format {}", s32(slot.config.frame_format.Value()));
        break;
    }
}

void Vic::Blend(const ConfigStruct& config, const SlotStruct& slot, VideoPixelFormat format) noexcept {
    auto const add_one = [](u32 const v) -> u32 {
        return v + u32(v > 0);
    };

    auto source_left = add_one(u32(slot.config.source_rect_left.Value()));
    auto source_right = add_one(u32(slot.config.source_rect_right.Value()));
    auto source_top = add_one(u32(slot.config.source_rect_top.Value()));
    auto source_bottom = add_one(u32(slot.config.source_rect_bottom.Value()));

    auto const dest_left = add_one(u32(slot.config.dest_rect_left.Value()));
    auto const dest_right = add_one(u32(slot.config.dest_rect_right.Value()));
    auto const dest_top = add_one(u32(slot.config.dest_rect_top.Value()));
    auto const dest_bottom = add_one(u32(slot.config.dest_rect_bottom.Value()));

    auto rect_left = add_one(u32(config.output_config.target_rect_left.Value()));
    auto rect_right = add_one(u32(config.output_config.target_rect_right.Value()));
    auto rect_top = add_one(u32(config.output_config.target_rect_top.Value()));
    auto rect_bottom = add_one(u32(config.output_config.target_rect_bottom.Value()));

    rect_left = (std::max)(rect_left, dest_left);
    rect_right = (std::min)(rect_right, dest_right);
    rect_top = (std::max)(rect_top, dest_top);
    rect_bottom = (std::min)(rect_bottom, dest_bottom);

    source_left = (std::max)(source_left, rect_left);
    source_right = (std::min)(source_right, rect_right);
    source_top = (std::max)(source_top, rect_top);
    source_bottom = (std::min)(source_bottom, rect_bottom);

    auto const out_surface_width = config.output_surface_config.out_surface_width + 1;
    auto const out_surface_height = config.output_surface_config.out_surface_height + 1;
    auto const in_surface_width = slot.surface_config.slot_surface_width + 1;

    source_bottom = (std::min)(source_bottom, out_surface_height);
    source_right = (std::min)(source_right, out_surface_width);

    [[maybe_unused]] auto const work_width = u32((std::max)(0, s32(source_right) - s32(source_left)));
    [[maybe_unused]] auto const work_height = u32((std::max)(0, s32(source_bottom) - s32(source_top)));

    // TODO Alpha blending. No games I've seen use more than a single surface or supply an alpha
    // below max, so it's ignored for now.
    if (slot.color_matrix.matrix_enable) {
        if (COMPILED_HAS_SSE41 && HasSSE41() && source_left % 8 == 0 && source_right % 8 == 0) {
            // MSVC doesn't define __SSE4_1__
#if COMPILED_HAS_SSE41
            // Fill the columns, e.g
            // c0 = [00 00 00 00] [r2c0 r2c0 r2c0 r2c0] [r1c0 r1c0 r1c0 r1c0] [r0c0 r0c0 r0c0 r0c0]
            auto const c0 = _mm_set_epi32(0, s32(slot.color_matrix.matrix_coeff20.Value()), s32(slot.color_matrix.matrix_coeff10.Value()), s32(slot.color_matrix.matrix_coeff00.Value()));
            auto const c1 = _mm_set_epi32(0, s32(slot.color_matrix.matrix_coeff21.Value()), s32(slot.color_matrix.matrix_coeff11.Value()), s32(slot.color_matrix.matrix_coeff01.Value()));
            auto const c2 = _mm_set_epi32(0, s32(slot.color_matrix.matrix_coeff22.Value()), s32(slot.color_matrix.matrix_coeff12.Value()), s32(slot.color_matrix.matrix_coeff02.Value()));
            auto const c3 = _mm_set_epi32(0, s32(slot.color_matrix.matrix_coeff23.Value()), s32(slot.color_matrix.matrix_coeff13.Value()), s32(slot.color_matrix.matrix_coeff03.Value()));

            // Set the matrix right-shift as a single element.
            auto const shift = _mm_set_epi32(0, 0, 0, s32(slot.color_matrix.matrix_r_shift.Value()));

            // Set every 16-bit value to the soft clamp values for clamping every 16-bit channel.
            auto const clamp_min = _mm_set1_epi16(u16(slot.config.soft_clamp_low.Value()));
            auto const clamp_max = _mm_set1_epi16(u16(slot.config.soft_clamp_high.Value()));

            // clang-format off

            auto MatMul = [](__m128i& p, const __m128i& col0, const __m128i& col1, const __m128i& col2, const __m128i& col3, const __m128i& trm_shift) -> __m128i {
                // Duplicate the 32-bit channels, e.g
                // p = [AA AA AA AA] [BB BB BB BB] [GG GG GG GG] [RR RR RR RR]
                // ->
                // r = [RR4 RR4 RR4 RR4] [RR3 RR3 RR3 RR3] [RR2 RR2 RR2 RR2] [RR1 RR1 RR1 RR1]
                auto r = _mm_shuffle_epi32(p, 0x0);
                auto g = _mm_shuffle_epi32(p, 0x55);
                auto b = _mm_shuffle_epi32(p, 0xAA);

                // Multiply the rows and columns c0 * r, c1 * g, c2 * b, e.g
                // r  = [RR4 RR4 RR4 RR4] [ RR3  RR3  RR3  RR3] [ RR2  RR2  RR2  RR2] [ RR1  RR1  RR1  RR1]
                //                                             *
                // c0 = [ 00  00  00  00] [r2c0 r2c0 r2c0 r2c0] [r1c0 r1c0 r1c0 r1c0] [r0c0 r0c0 r0c0 r0c0]
                r = _mm_mullo_epi32(r, col0);
                g = _mm_mullo_epi32(g, col1);
                b = _mm_mullo_epi32(b, col2);

                // Add them all together vertically, such that the 32-bit element
                // out[0] = (r[0] * c0[0]) + (g[0] * c1[0]) + (b[0] * c2[0])
                auto out = _mm_add_epi32(_mm_add_epi32(r, g), b);

                // Shift the result by r_shift, as the TRM says
                out = _mm_sra_epi32(out, trm_shift);

                // Add the final column. Because the 4x1 matrix has this row as 1, there's no need to
                // multiply by it, and as per the TRM this column ignores r_shift, so it's just added
                // here after shifting.
                out = _mm_add_epi32(out, col3);

                // Shift the result back from S12.8 to integer values
                return _mm_srai_epi32(out, 8);
            };

            for (u32 y = source_top; y < source_bottom; y++) {
                auto const src{y * in_surface_width + source_left};
                auto const dst{y * out_surface_width + rect_left};
                for (u32 x = source_left; x < source_right; x += 8) {
                    // clang-format off
                    // Prefetch the next iteration's memory
                    _mm_prefetch((const char*)&slot_surface[src + x + 8], _MM_HINT_T0);

                    // Load in pixels
                    // p01 = [AA AA] [BB BB] [GG GG] [RR RR] [AA AA] [BB BB] [GG GG] [RR RR]
                    auto p01 = _mm_load_si128((__m128i*)&slot_surface[src + x + 0]);
                    auto p23 = _mm_load_si128((__m128i*)&slot_surface[src + x + 2]);
                    auto p45 = _mm_load_si128((__m128i*)&slot_surface[src + x + 4]);
                    auto p67 = _mm_load_si128((__m128i*)&slot_surface[src + x + 6]);

                    // Convert the 16-bit channels into 32-bit (unsigned), as the matrix values are
                    // 32-bit and to avoid overflow.
                    // p01    = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1]
                    // ->
                    // p01_lo = [001 001 AA1 AA1] [001 001 BB1 BB1] [001 001 GG1 GG1] [001 001 RR1 RR1]
                    // p01_hi = [002 002 AA2 AA2] [002 002 BB2 BB2] [002 002 GG2 GG2] [002 002 RR2 RR2]
                    auto p01_lo = _mm_cvtepu16_epi32(p01);
                    auto p01_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p01, 8));
                    auto p23_lo = _mm_cvtepu16_epi32(p23);
                    auto p23_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p23, 8));
                    auto p45_lo = _mm_cvtepu16_epi32(p45);
                    auto p45_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p45, 8));
                    auto p67_lo = _mm_cvtepu16_epi32(p67);
                    auto p67_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p67, 8));

                    // Matrix multiply the pixel, doing the colour conversion.
                    auto out0 = MatMul(p01_lo, c0, c1, c2, c3, shift);
                    auto out1 = MatMul(p01_hi, c0, c1, c2, c3, shift);
                    auto out2 = MatMul(p23_lo, c0, c1, c2, c3, shift);
                    auto out3 = MatMul(p23_hi, c0, c1, c2, c3, shift);
                    auto out4 = MatMul(p45_lo, c0, c1, c2, c3, shift);
                    auto out5 = MatMul(p45_hi, c0, c1, c2, c3, shift);
                    auto out6 = MatMul(p67_lo, c0, c1, c2, c3, shift);
                    auto out7 = MatMul(p67_hi, c0, c1, c2, c3, shift);

                    // Pack the 32-bit channel pixels back into 16-bit using unsigned saturation
                    // out0  = [001 001 AA1 AA1] [001 001 BB1 BB1] [001 001 GG1 GG1] [001 001 RR1 RR1]
                    // out1  = [002 002 AA2 AA2] [002 002 BB2 BB2] [002 002 GG2 GG2] [002 002 RR2 RR2]
                    // ->
                    // done0 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1]
                    auto done0 = _mm_packus_epi32(out0, out1);
                    auto done1 = _mm_packus_epi32(out2, out3);
                    auto done2 = _mm_packus_epi32(out4, out5);
                    auto done3 = _mm_packus_epi32(out6, out7);

                    // Blend the original alpha back into the pixel, as the matrix multiply gives us a
                    // 3-channel output, not 4.
                    // 0x88 = b10001000, taking RGB from the first argument, A from the second argument.
                    // done0 = [002 002] [BB2 BB2] [GG2 GG2] [RR2 RR2] [001 001] [BB1 BB1] [GG1 GG1] [RR1 RR1]
                    // ->
                    // done0 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1]
                    done0 = _mm_blend_epi16(done0, p01, 0x88);
                    done1 = _mm_blend_epi16(done1, p23, 0x88);
                    done2 = _mm_blend_epi16(done2, p45, 0x88);
                    done3 = _mm_blend_epi16(done3, p67, 0x88);

                    // Clamp the 16-bit channels to the soft-clamp min/max.
                    done0 = _mm_max_epu16(done0, clamp_min);
                    done1 = _mm_max_epu16(done1, clamp_min);
                    done2 = _mm_max_epu16(done2, clamp_min);
                    done3 = _mm_max_epu16(done3, clamp_min);

                    done0 = _mm_min_epu16(done0, clamp_max);
                    done1 = _mm_min_epu16(done1, clamp_max);
                    done2 = _mm_min_epu16(done2, clamp_max);
                    done3 = _mm_min_epu16(done3, clamp_max);

                    // Store the pixels to the output surface.
                    _mm_store_si128((__m128i*)&output_surface[dst + x + 0], done0);
                    _mm_store_si128((__m128i*)&output_surface[dst + x + 2], done1);
                    _mm_store_si128((__m128i*)&output_surface[dst + x + 4], done2);
                    _mm_store_si128((__m128i*)&output_surface[dst + x + 6], done3);

                }
            }
#endif
        } else {
            // Colour conversion is enabled, this is a 3x4 * 4x1 matrix multiplication, resulting in a 3x1 matrix.
            // | r0c0 r0c1 r0c2 r0c3 |   | R |   | R |
            // | r1c0 r1c1 r1c2 r1c3 | * | G | = | G |
            // | r2c0 r2c1 r2c2 r2c3 |   | B |   | B |
            //                           | 1 |
            const auto r0c0 = s32(slot.color_matrix.matrix_coeff00.Value());
            const auto r0c1 = s32(slot.color_matrix.matrix_coeff01.Value());
            const auto r0c2 = s32(slot.color_matrix.matrix_coeff02.Value());
            const auto r0c3 = s32(slot.color_matrix.matrix_coeff03.Value());
            const auto r1c0 = s32(slot.color_matrix.matrix_coeff10.Value());
            const auto r1c1 = s32(slot.color_matrix.matrix_coeff11.Value());
            const auto r1c2 = s32(slot.color_matrix.matrix_coeff12.Value());
            const auto r1c3 = s32(slot.color_matrix.matrix_coeff13.Value());
            const auto r2c0 = s32(slot.color_matrix.matrix_coeff20.Value());
            const auto r2c1 = s32(slot.color_matrix.matrix_coeff21.Value());
            const auto r2c2 = s32(slot.color_matrix.matrix_coeff22.Value());
            const auto r2c3 = s32(slot.color_matrix.matrix_coeff23.Value());
            const auto shift = s32(slot.color_matrix.matrix_r_shift.Value());
            const auto clamp_min = s32(slot.config.soft_clamp_low.Value());
            const auto clamp_max = s32(slot.config.soft_clamp_high.Value());
            auto MatMul = [&](const Pixel& in_pixel) -> std::tuple<s32, s32, s32, s32> {
                auto r = s32(in_pixel.r);
                auto g = s32(in_pixel.g);
                auto b = s32(in_pixel.b);
                r = in_pixel.r * r0c0 + in_pixel.g * r0c1 + in_pixel.b * r0c2;
                g = in_pixel.r * r1c0 + in_pixel.g * r1c1 + in_pixel.b * r1c2;
                b = in_pixel.r * r2c0 + in_pixel.g * r2c1 + in_pixel.b * r2c2;
                r >>= shift;
                g >>= shift;
                b >>= shift;
                r += r0c3;
                g += r1c3;
                b += r2c3;
                r >>= 8;
                g >>= 8;
                b >>= 8;
                return {r, g, b, s32(in_pixel.a)};
            };
            for (u32 y = source_top; y < source_bottom; y++) {
                const auto src{y * in_surface_width + source_left};
                const auto dst{y * out_surface_width + rect_left};
                for (u32 x = source_left; x < source_right; x++) {
                    auto [r, g, b, a] = MatMul(slot_surface[src + x]);
                    r = std::clamp(r, clamp_min, clamp_max);
                    g = std::clamp(g, clamp_min, clamp_max);
                    b = std::clamp(b, clamp_min, clamp_max);
                    a = std::clamp(a, clamp_min, clamp_max);
                    output_surface[dst + x] = {u16(r), u16(g), u16(b), u16(a)};
                }
            }
        }
    } else {
        auto const copy_width = (std::min)(source_right - source_left, rect_right - rect_left);
        for (u32 y = 0; y < work_height; y++) {
            auto const dst_line = (y + source_top) * out_surface_width;
            auto const src_line = (y + source_top) * in_surface_width;
            std::memcpy(&output_surface[dst_line + rect_left], &slot_surface[src_line + source_left], copy_width * sizeof(Pixel));
        }
    }
}

void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) noexcept {
    constexpr u32 BytesPerPixel = 1;

    auto surface_width{output_surface_config.out_surface_width + 1};
    auto surface_height{output_surface_config.out_surface_height + 1};
    auto const surface_stride{surface_width};

    auto const out_luma_width = output_surface_config.out_luma_width + 1;
    auto const out_luma_height = output_surface_config.out_luma_height + 1;
    auto const out_luma_stride = Common::AlignUp(out_luma_width * BytesPerPixel, 0x10);
    auto const out_luma_size = out_luma_height * out_luma_stride;

    auto const out_chroma_width = output_surface_config.out_chroma_width + 1;
    auto const out_chroma_height = output_surface_config.out_chroma_height + 1;
    auto const out_chroma_stride = Common::AlignUp(out_chroma_width * BytesPerPixel * 2, 0x10);
    auto const out_chroma_size = out_chroma_height * out_chroma_stride;

    surface_width = (std::min)(surface_width, out_luma_width);
    surface_height = (std::min)(surface_height, out_luma_height);

    auto Decode = [&](u8* out_luma, u8* out_chroma) {
        if (COMPILED_HAS_SSE41 && HasSSE41() && surface_width % 16 == 0) {
#if COMPILED_HAS_SSE41
            // luma_mask   = [00 00] [00 00] [00 00] [FF FF] [00 00] [00 00] [00 00] [FF FF]
            auto const luma_mask = _mm_set_epi16(0, 0, 0, -1, 0, 0, 0, -1);
            auto const sse_aligned_width = Common::AlignDown(surface_width, 16);
            for (u32 y = 0; y < surface_height; ++y) {
                auto const src = y * surface_stride;
                auto const dst_luma = y * out_luma_stride;
                auto const dst_chroma = (y / 2) * out_chroma_stride;
                u32 x = 0;
                for (; x < sse_aligned_width; x += 16) {
                    // clang-format off
                    // Prefetch the next cache lines, 2 per iteration
                    _mm_prefetch((const char*)&output_surface[src + x + 16], _MM_HINT_T0);
                    _mm_prefetch((const char*)&output_surface[src + x + 24], _MM_HINT_T0);

                    // Load the 64-bit pixels, 2 per variable.
                    auto pixel01 = _mm_load_si128((__m128i*)&output_surface[src + x + 0]);
                    auto pixel23 = _mm_load_si128((__m128i*)&output_surface[src + x + 2]);
                    auto pixel45 = _mm_load_si128((__m128i*)&output_surface[src + x + 4]);
                    auto pixel67 = _mm_load_si128((__m128i*)&output_surface[src + x + 6]);
                    auto pixel89 = _mm_load_si128((__m128i*)&output_surface[src + x + 8]);
                    auto pixel1011 = _mm_load_si128((__m128i*)&output_surface[src + x + 10]);
                    auto pixel1213 = _mm_load_si128((__m128i*)&output_surface[src + x + 12]);
                    auto pixel1415 = _mm_load_si128((__m128i*)&output_surface[src + x + 14]);

                    // Split out the luma of each pixel using the luma_mask above.
                    // pixel01 = [AA2 AA2] [VV2 VV2] [UU2 UU2] [LL2 LL2] [AA1 AA1] [VV1 VV1] [UU1 UU1] [LL1 LL1]
                    // ->
                    //     l01 = [002 002] [002 002] [002 002] [LL2 LL2] [001 001] [001 001] [001 001] [LL1 LL1]
                    auto l01 = _mm_and_si128(pixel01, luma_mask);
                    auto l23 = _mm_and_si128(pixel23, luma_mask);
                    auto l45 = _mm_and_si128(pixel45, luma_mask);
                    auto l67 = _mm_and_si128(pixel67, luma_mask);
                    auto l89 = _mm_and_si128(pixel89, luma_mask);
                    auto l1011 = _mm_and_si128(pixel1011, luma_mask);
                    auto l1213 = _mm_and_si128(pixel1213, luma_mask);
                    auto l1415 = _mm_and_si128(pixel1415, luma_mask);

                    // Pack 32-bit elements from 2 registers down into 16-bit elements in 1 register.
                    // l01   = [002 002 002 002] [002 002 LL2 LL2] [001 001 001 001] [001 001 LL1 LL1]
                    // l23   = [004 004 004 004] [004 004 LL4 LL4] [003 003 003 003] [003 003 LL3 LL3]
                    // ->
                    // l0123 = [004 004] [LL4 LL4] [003 003] [LL3 LL3] [002 002] [LL2 LL2] [001 001] [LL1 LL1]
                    auto l0123 = _mm_packus_epi32(l01, l23);
                    auto l4567 = _mm_packus_epi32(l45, l67);
                    auto l891011 = _mm_packus_epi32(l89, l1011);
                    auto l12131415 = _mm_packus_epi32(l1213, l1415);

                    // Pack 32-bit elements from 2 registers down into 16-bit elements in 1 register.
                    // l0123   = [004 004 LL4 LL4] [003 003 LL3 LL3] [002 002 LL2 LL2] [001 001 LL1 LL1]
                    // l4567   = [008 008 LL8 LL8] [007 007 LL7 LL7] [006 006 LL6 LL6] [005 005 LL5 LL5]
                    // ->
                    // luma_lo = [LL8 LL8] [LL7 LL7] [LL6 LL6] [LL5 LL5] [LL4 LL4] [LL3 LL3] [LL2 LL2] [LL1 LL1]
                    auto luma_lo = _mm_packus_epi32(l0123, l4567);
                    auto luma_hi = _mm_packus_epi32(l891011, l12131415);

                    // Right-shift the 16-bit elements by 2, un-doing the left shift by 2 on read
                    // and bringing the range back to 8-bit.
                    luma_lo = _mm_srli_epi16(luma_lo, 2);
                    luma_hi = _mm_srli_epi16(luma_hi, 2);

                    // Pack with unsigned saturation the 16-bit values in 2 registers into 8-bit values in 1 register.
                    // luma_lo =  [LL8  LL8]  [LL7  LL7]  [LL6  LL6]  [LL5  LL5]  [LL4  LL4]  [LL3  LL3]  [LL2  LL2] [LL1 LL1]
                    // luma_hi = [LL16 LL16] [LL15 LL15] [LL14 LL14] [LL13 LL13] [LL12 LL12] [LL11 LL11] [LL10 LL10] [LL9 LL9]
                    // ->
                    // luma = [LL16] [LL15] [LL14] [LL13] [LL12] [LL11] [LL10] [LL9] [LL8] [LL7] [LL6] [LL5] [LL4] [LL3] [LL2] [LL1]
                    auto luma = _mm_packus_epi16(luma_lo, luma_hi);

                    // Store the 16 bytes of luma
                    _mm_store_si128((__m128i*)&out_luma[dst_luma + x], luma);

                    if (y % 2 == 0) {
                        // Chroma, done every other line as it's half the height of luma.

                        // Shift the register right by 2 bytes (not bits), to kick out the 16-bit luma.
                        // We can do this instead of &'ing a mask and then shifting.
                        // pixel01 = [AA2 AA2] [VV2 VV2] [UU2 UU2] [LL2 LL2] [AA1 AA1] [VV1 VV1] [UU1 UU1] [LL1 LL1]
                        // ->
                        //     c01 = [ 00  00] [AA2 AA2] [VV2 VV2] [UU2 UU2] [LL2 LL2] [AA1 AA1] [VV1 VV1] [UU1 UU1]
                        auto c01 = _mm_srli_si128(pixel01, 2);
                        auto c23 = _mm_srli_si128(pixel23, 2);
                        auto c45 = _mm_srli_si128(pixel45, 2);
                        auto c67 = _mm_srli_si128(pixel67, 2);
                        auto c89 = _mm_srli_si128(pixel89, 2);
                        auto c1011 = _mm_srli_si128(pixel1011, 2);
                        auto c1213 = _mm_srli_si128(pixel1213, 2);
                        auto c1415 = _mm_srli_si128(pixel1415, 2);

                        // Interleave the lower 8 bytes as 32-bit elements from 2 registers into 1 register.
                        // This has the effect of skipping every other chroma value horitonally,
                        // notice the high pixels UU2/UU4 are skipped.
                        // This is intended as N420 chroma width is half the luma width.
                        // c01   = [ 00  00 AA2 AA2] [VV2 VV2 UU2 UU2] [LL2 LL2 AA1 AA1] [VV1 VV1 UU1 UU1]
                        // c23   = [ 00  00 AA4 AA4] [VV4 VV4 UU4 UU4] [LL4 LL4 AA3 AA3] [VV3 VV3 UU3 UU3]
                        // ->
                        // c0123 = [LL4 LL4 AA3 AA3] [LL2 LL2 AA1 AA1] [VV3 VV3 UU3 UU3] [VV1 VV1 UU1 UU1]
                        auto c0123 = _mm_unpacklo_epi32(c01, c23);
                        auto c4567 = _mm_unpacklo_epi32(c45, c67);
                        auto c891011 = _mm_unpacklo_epi32(c89, c1011);
                        auto c12131415 = _mm_unpacklo_epi32(c1213, c1415);

                        // Interleave the low 64-bit elements from 2 registers into 1.
                        // c0123     = [LL4 LL4 AA3 AA3 LL2 LL2 AA1 AA1] [VV3 VV3 UU3 UU3 VV1 VV1 UU1 UU1]
                        // c4567     = [LL8 LL8 AA7 AA7 LL6 LL6 AA5 AA5] [VV7 VV7 UU7 UU7 VV5 VV5 UU5 UU5]
                        // ->
                        // chroma_lo = [VV7 VV7 UU7 UU7 VV5 VV5 UU5 UU5] [VV3 VV3 UU3 UU3 VV1 VV1 UU1 UU1]
                        auto chroma_lo = _mm_unpacklo_epi64(c0123, c4567);
                        auto chroma_hi = _mm_unpacklo_epi64(c891011, c12131415);

                        // Right-shift the 16-bit elements by 2, un-doing the left shift by 2 on read
                        // and bringing the range back to 8-bit.
                        chroma_lo = _mm_srli_epi16(chroma_lo, 2);
                        chroma_hi = _mm_srli_epi16(chroma_hi, 2);

                        // Pack with unsigned saturation the 16-bit elements from 2 registers into 8-bit elements in 1 register.
                        // chroma_lo = [ VV7  VV7] [ UU7  UU7] [ VV5  VV5] [ UU5  UU5] [ VV3  VV3] [ UU3  UU3] [VV1 VV1] [UU1 UU1]
                        // chroma_hi = [VV15 VV15] [UU15 UU15] [VV13 VV13] [UU13 UU13] [VV11 VV11] [UU11 UU11] [VV9 VV9] [UU9 UU9]
                        // ->
                        // chroma    = [VV15] [UU15] [VV13] [UU13] [VV11] [UU11] [VV9] [UU9] [VV7] [UU7] [VV5] [UU5] [VV3] [UU3] [VV1] [UU1]
                        auto chroma = _mm_packus_epi16(chroma_lo, chroma_hi);

                        // Store the 16 bytes of chroma.
                        _mm_store_si128((__m128i*)&out_chroma[dst_chroma + x + 0], chroma);
                    }

                    // clang-format on
                }

                auto const src_chroma = y * surface_stride;
                for (; x < surface_width; x += 2) {
                    out_luma[dst_luma + x + 0] = u8(output_surface[src + x + 0].r >> 2);
                    out_luma[dst_luma + x + 1] = u8(output_surface[src + x + 1].r >> 2);
                    out_chroma[dst_chroma + x + 0] = u8(output_surface[src_chroma + x].g >> 2);
                    out_chroma[dst_chroma + x + 1] = u8(output_surface[src_chroma + x].b >> 2);
                }
            }
#endif
        } else {
            for (size_t y = 0; y < surface_height; ++y) {
                auto const src_luma = y * surface_stride;
                auto const dst_luma = y * out_luma_stride;
                auto const src_chroma = y * surface_stride;
                auto const dst_chroma = (y / 2) * out_chroma_stride;
                for (size_t x = 0; x < surface_width; x += 2) {
                    out_luma[dst_luma + x + 0] = u8(output_surface[src_luma + x + 0].r >> 2);
                    out_luma[dst_luma + x + 1] = u8(output_surface[src_luma + x + 1].r >> 2);
                    out_chroma[dst_chroma + x + 0] = u8(output_surface[src_chroma + x].g >> 2);
                    out_chroma[dst_chroma + x + 1] = u8(output_surface[src_chroma + x].b >> 2);
                }
            }
        }
    };

    switch (output_surface_config.out_block_kind) {
    case BLK_KIND::GENERIC_16Bx2: {
        u32 const block_height = u32(output_surface_config.out_block_height);
        auto const out_luma_swizzle_size = Texture::CalculateSize(true, BytesPerPixel, out_luma_width, out_luma_height, 1, block_height, 0);
        auto const out_chroma_swizzle_size = Texture::CalculateSize(true, BytesPerPixel * 2, out_chroma_width, out_chroma_height, 1, block_height, 0);

        LOG_TRACE(HW_GPU, "Writing Y8__V8U8_N420 swizzled frame\n"
            "\tinput surface {}x{} stride {} size {:#X}\n"
            "\toutput   luma {}x{} stride {} size {:#X} block height {} swizzled size 0x{:X}\n",
            "\toutput chroma {}x{} stride {} size {:#X} block height {} swizzled size 0x{:X}",
            surface_width, surface_height, surface_stride * BytesPerPixel,
            surface_stride * surface_height * BytesPerPixel, out_luma_width, out_luma_height,
            out_luma_stride, out_luma_size, block_height, out_luma_swizzle_size, out_chroma_width,
            out_chroma_height, out_chroma_stride, out_chroma_size, block_height,
            out_chroma_swizzle_size);

        luma_scratch.resize_destructive(out_luma_size);
        chroma_scratch.resize_destructive(out_chroma_size);
        Decode(luma_scratch.data(), chroma_scratch.data());

        Tegra::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeWrite> out_luma(host1x.gmmu_manager, regs.output_surface.luma.Address(), out_luma_swizzle_size, &swizzle_scratch);
        Tegra::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeWrite> out_chroma(host1x.gmmu_manager, regs.output_surface.chroma_u.Address(), out_chroma_swizzle_size, &swizzle_scratch);
        if (block_height == 1) {
            SwizzleSurface(out_luma, out_luma_stride, luma_scratch, out_luma_stride, out_luma_height);
            SwizzleSurface(out_chroma, out_chroma_stride, chroma_scratch, out_chroma_stride, out_chroma_height);
        } else {
            Texture::SwizzleTexture(out_luma, luma_scratch, BytesPerPixel, out_luma_width, out_luma_height, 1, block_height, 0, 1);
            Texture::SwizzleTexture(out_chroma, chroma_scratch, BytesPerPixel, out_chroma_width, out_chroma_height, 1, block_height, 0, 1);
        }
    } break;
    case BLK_KIND::PITCH: {
        LOG_TRACE(HW_GPU, "Writing Y8__V8U8_N420 swizzled frame\n"
            "\tinput surface {}x{} stride {} size {:#X}\n"
            "\toutput   luma {}x{} stride {} size {:#X} block height {} swizzled size 0x{:X}\n",
            "\toutput chroma {}x{} stride {} size {:#X} block height {} swizzled size 0x{:X}",
            surface_width, surface_height, surface_stride * BytesPerPixel,
            surface_stride * surface_height * BytesPerPixel, out_luma_width, out_luma_height,
            out_luma_stride, out_luma_size, out_chroma_width, out_chroma_height, out_chroma_stride,
            out_chroma_size);

        // Unfortunately due to a driver bug or game bug, the chroma address can be not
        // appropriately spaced from the luma, so the luma of size out_stride * height runs into the
        // top of the chroma buffer. Unfortunately that removes an optimisation here where we could
        // create guest spans and decode into game memory directly to avoid the memory copy from
        // scratch to game. Due to this bug, we must write the luma first, and then the chroma
        // afterwards to re-overwrite the luma being too large.
        luma_scratch.resize_destructive(out_luma_size);
        chroma_scratch.resize_destructive(out_chroma_size);
        Decode(luma_scratch.data(), chroma_scratch.data());
        host1x.gmmu_manager.WriteBlock(regs.output_surface.luma.Address(), luma_scratch.data(), out_luma_size);
        host1x.gmmu_manager.WriteBlock(regs.output_surface.chroma_u.Address(), chroma_scratch.data(), out_chroma_size);
    } break;
    default:
        UNREACHABLE();
        break;
    }
}

void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config, VideoPixelFormat format) noexcept {
    constexpr u32 BytesPerPixel = 4;

    auto surface_width = output_surface_config.out_surface_width + 1;
    auto surface_height = output_surface_config.out_surface_height + 1;
    auto const surface_stride = surface_width;

    auto const out_luma_width = output_surface_config.out_luma_width + 1;
    auto const out_luma_height = output_surface_config.out_luma_height + 1;
    auto const out_luma_stride = Common ::AlignUp(out_luma_width * BytesPerPixel, 0x10);
    auto const out_luma_size = out_luma_height * out_luma_stride;

    surface_width = (std::min)(surface_width, out_luma_width);
    surface_height = (std::min)(surface_height, out_luma_height);

    auto Decode = [&](u8* out, Pixel const* inp) {
        if (COMPILED_HAS_SSE41 && HasSSE41() && surface_width % 16 == 0) {
#if COMPILED_HAS_SSE41
            auto const sse_aligned_width = Common::AlignDown(surface_width, 16);
            for (u32 y = 0; y < surface_height; y++) {
                auto const src = y * surface_stride;
                auto const dst = y * out_luma_stride;
                u32 x = 0;
                for (; x < sse_aligned_width; x += 16) {
                    // Prefetch the next 2 cache lines
                    _mm_prefetch((const char*)&inp[src + x + 16], _MM_HINT_T0);
                    _mm_prefetch((const char*)&inp[src + x + 24], _MM_HINT_T0);

                    // Load the pixels, 16-bit channels, 8 bytes per pixel, e.g
                    // pixel01 = [AA AA BB BB GG GG RR RR AA AA BB BB GG GG RR RR
                    auto pixel01 = _mm_load_si128((__m128i*)&inp[src + x + 0]);
                    auto pixel23 = _mm_load_si128((__m128i*)&inp[src + x + 2]);
                    auto pixel45 = _mm_load_si128((__m128i*)&inp[src + x + 4]);
                    auto pixel67 = _mm_load_si128((__m128i*)&inp[src + x + 6]);
                    auto pixel89 = _mm_load_si128((__m128i*)&inp[src + x + 8]);
                    auto pixel1011 = _mm_load_si128((__m128i*)&inp[src + x + 10]);
                    auto pixel1213 = _mm_load_si128((__m128i*)&inp[src + x + 12]);
                    auto pixel1415 = _mm_load_si128((__m128i*)&inp[src + x + 14]);

                    // Right-shift the channels by 16 to un-do the left shit on read and bring the range
                    // back to 8-bit.
                    pixel01 = _mm_srli_epi16(pixel01, 2);
                    pixel23 = _mm_srli_epi16(pixel23, 2);
                    pixel45 = _mm_srli_epi16(pixel45, 2);
                    pixel67 = _mm_srli_epi16(pixel67, 2);
                    pixel89 = _mm_srli_epi16(pixel89, 2);
                    pixel1011 = _mm_srli_epi16(pixel1011, 2);
                    pixel1213 = _mm_srli_epi16(pixel1213, 2);
                    pixel1415 = _mm_srli_epi16(pixel1415, 2);

                    // Pack with unsigned saturation 16-bit channels from 2 registers into 8-bit channels in 1 register.
                    // pixel01    = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1]
                    // pixel23    = [AA4 AA4] [BB4 BB4] [GG4 GG4] [RR4 RR4] [AA3 AA3] [BB3 BB3] [GG3 GG3] [RR3 RR3]
                    // ->
                    // pixels0_lo = [AA4] [BB4] [GG4] [RR4] [AA3] [BB3] [GG3] [RR3] [AA2] [BB2] [GG2] [RR2] [AA1] [BB1] [GG1] [RR1]
                    auto pixels0_lo = _mm_packus_epi16(pixel01, pixel23);
                    auto pixels0_hi = _mm_packus_epi16(pixel45, pixel67);
                    auto pixels1_lo = _mm_packus_epi16(pixel89, pixel1011);
                    auto pixels1_hi = _mm_packus_epi16(pixel1213, pixel1415);

                    if (format == VideoPixelFormat::A8R8G8B8) {
                        auto const shuffle = _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2);

                        // Our pixels are ABGR (big-endian) by default, if ARGB is needed, we need to shuffle.
                        // pixels0_lo = [AA4 BB4 GG4 RR4] [AA3 BB3 GG3 RR3] [AA2 BB2 GG2 RR2] [AA1 BB1 GG1 RR1]
                        // ->
                        // pixels0_lo = [AA4 RR4 GG4 BB4] [AA3 RR3 GG3 BB3] [AA2 RR2 GG2 BB2] [AA1 RR1 GG1 BB1]
                        pixels0_lo = _mm_shuffle_epi8(pixels0_lo, shuffle);
                        pixels0_hi = _mm_shuffle_epi8(pixels0_hi, shuffle);
                        pixels1_lo = _mm_shuffle_epi8(pixels1_lo, shuffle);
                        pixels1_hi = _mm_shuffle_epi8(pixels1_hi, shuffle);
                    }

                    // Store the pixels
                    _mm_store_si128((__m128i*)&out[dst + x * 4 + 0], pixels0_lo);
                    _mm_store_si128((__m128i*)&out[dst + x * 4 + 16], pixels0_hi);
                    _mm_store_si128((__m128i*)&out[dst + x * 4 + 32], pixels1_lo);
                    _mm_store_si128((__m128i*)&out[dst + x * 4 + 48], pixels1_hi);
                }
                for (; x < surface_width; x++) {
                    if (format == VideoPixelFormat::A8R8G8B8) {
                        out[dst + x * 4 + 0] = u8(inp[src + x].b >> 2);
                        out[dst + x * 4 + 1] = u8(inp[src + x].g >> 2);
                        out[dst + x * 4 + 2] = u8(inp[src + x].r >> 2);
                        out[dst + x * 4 + 3] = u8(inp[src + x].a >> 2);
                    } else {
                        out[dst + x * 4 + 0] = u8(inp[src + x].r >> 2);
                        out[dst + x * 4 + 1] = u8(inp[src + x].g >> 2);
                        out[dst + x * 4 + 2] = u8(inp[src + x].b >> 2);
                        out[dst + x * 4 + 3] = u8(inp[src + x].a >> 2);
                    }
                }
            }
#endif
        } else {
            for (size_t y = 0; y < surface_height; ++y) {
                auto const src = y * surface_stride, dst = y * out_luma_stride;
                for (size_t x = 0; x < surface_width; ++x) {
                    if(format == VideoPixelFormat::A8R8G8B8) {
                        out[dst + x * 4 + 0] = u8(inp[src + x].b >> 2);
                        out[dst + x * 4 + 1] = u8(inp[src + x].g >> 2);
                        out[dst + x * 4 + 2] = u8(inp[src + x].r >> 2);
                        out[dst + x * 4 + 3] = u8(inp[src + x].a >> 2);
                    } else {
                        out[dst + x * 4 + 0] = u8(inp[src + x].r >> 2);
                        out[dst + x * 4 + 1] = u8(inp[src + x].g >> 2);
                        out[dst + x * 4 + 2] = u8(inp[src + x].b >> 2);
                        out[dst + x * 4 + 3] = u8(inp[src + x].a >> 2);
                    }
                }
            }
        }
    };

    switch (output_surface_config.out_block_kind) {
    case BLK_KIND::GENERIC_16Bx2: {
        const u32 block_height = u32(output_surface_config.out_block_height);
        auto const out_swizzle_size = Texture::CalculateSize(true, BytesPerPixel, out_luma_width, out_luma_height, 1, block_height, 0);
        LOG_TRACE(HW_GPU, "Writing ABGR swizzled frame\n"
            "\tinput surface {}x{} stride {} size {:#X}\n"
            "\toutput surface {}x{} stride {} size {:#X} block height {} swizzled size 0x{:X}",
            surface_width, surface_height, surface_stride * BytesPerPixel,
            surface_stride * surface_height * BytesPerPixel, out_luma_width, out_luma_height,
            out_luma_stride, out_luma_size, block_height, out_swizzle_size);
        luma_scratch.resize_destructive(out_luma_size);
        Decode(luma_scratch.data(), output_surface.data());

        Tegra::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeWrite> out_luma(host1x.gmmu_manager, regs.output_surface.luma.Address(), out_swizzle_size, &swizzle_scratch);
        if (block_height == 1) {
            SwizzleSurface(out_luma, out_luma_stride, luma_scratch, out_luma_stride, out_luma_height);
        } else {
            Texture::SwizzleTexture(out_luma, luma_scratch, BytesPerPixel, out_luma_width, out_luma_height, 1, block_height, 0, 1);
        }
    } break;
    case BLK_KIND::PITCH: {
        LOG_TRACE(HW_GPU, "Writing ABGR pitch frame\n"
            "\tinput surface {}x{} stride {} size {:#X}"
            "\toutput surface {}x{} stride {} size {:#X}",
            surface_width, surface_height, surface_stride,
            surface_stride * surface_height * BytesPerPixel, out_luma_width, out_luma_height,
            out_luma_stride, out_luma_size);
        luma_scratch.resize_destructive(out_luma_size);
        Tegra::Memory::GpuGuestMemoryScoped<u8, Core::Memory::GuestMemoryFlags::SafeWrite> out_luma(host1x.gmmu_manager, regs.output_surface.luma.Address(), out_luma_size, &luma_scratch);
        Decode(out_luma.data(), output_surface.data());
    } break;
    default:
        UNREACHABLE();
        break;
    }
}

} // namespace Tegra::Host1x