From 0e1c0c66250645e2290e3549acc8e3fd1776dba3 Mon Sep 17 00:00:00 2001
From: alper-han <89567766+alper-han@users.noreply.github.com>
Date: Thu, 19 Feb 2026 08:10:39 +0300
Subject: [PATCH] jpeg: reconstruct baseline bitstream for NVDEC

Rebuild a minimal JFIF stream from VA-API JPEG buffers (PicParam/IQMatrix/DHT/Slice)
and submit it to NVDEC as a single slice. Validate slice bounds and fall back to
standard Huffman tables when VA tables are missing/invalid.
---
 src/jpeg.c      | 753 ++++++++++++++++++++++++++++++++++++++++++++++--
 src/vabackend.c |   6 +
 src/vabackend.h |   3 +
 3 files changed, 741 insertions(+), 21 deletions(-)

diff --git a/src/jpeg.c b/src/jpeg.c
index abd45e0f..b210df52 100644
--- a/src/jpeg.c
+++ b/src/jpeg.c
@@ -1,41 +1,745 @@
 #include "vabackend.h"
+
+#include <stdint.h>
+#include <stdlib.h>
 #include <string.h>
 
-/* This one looks difficult to implement as NVDEC wants the whole JPEG file, and VA-API only supplied part of it */
+/* JPEG Decode Implementation
+ *
+ * VA-API supplies JPEG data as separate buffers (picture params, IQ tables,
+ * Huffman tables, slice data). NVDEC expects a complete JPEG bitstream.
+ * This codec reconstructs a minimal JFIF-compliant JPEG from VA buffers and
+ * feeds it to NVDEC.
+ */
+
+// JPEG marker bytes (JPEG spec: ISO/IEC 10918-1)
+#define JPEG_SOI        0xD8    // Start of Image
+#define JPEG_EOI        0xD9    // End of Image
+#define JPEG_APP0       0xE0    // JFIF application marker
+#define JPEG_DQT        0xDB    // Define Quantization Table
+#define JPEG_SOF0       0xC0    // Start of Frame (Baseline DCT)
+#define JPEG_DHT        0xC4    // Define Huffman Table
+#define JPEG_DRI        0xDD    // Define Restart Interval
+#define JPEG_SOS        0xDA    // Start of Scan
+#define JPEG_MARKER     0xFF    // Marker prefix byte
+
+#define JPEG_MAX_COMPONENTS 4U
+
+typedef struct {
+    VAPictureParameterBufferJPEGBaseline picParams;
+    VAIQMatrixBufferJPEGBaseline         iqMatrix;
+    VAHuffmanTableBufferJPEGBaseline     huffmanTable;
+    bool                                 hasPicParams;
+    bool                                 hasIQMatrix;
+    bool                                 hasHuffmanTable;
+    uint8_t                              validQuantTablesMask;
+    uint8_t                              validHuffmanDcMask;
+    uint8_t                              validHuffmanAcMask;
+} JPEGContext;
+
+// Minimal APP0/JFIF header
+static const uint8_t jfifHeader[] = {
+    JPEG_MARKER, JPEG_SOI,           // Start of Image
+    JPEG_MARKER, JPEG_APP0,          // APP0 marker
+    0x00, 0x10,                      // Length (16 bytes)
+    0x4A, 0x46, 0x49, 0x46, 0x00,    // "JFIF\0"
+    0x01, 0x01,                      // Version 1.1
+    0x00,                            // Units (0 = none)
+    0x00, 0x01,                      // X density
+    0x00, 0x01,                      // Y density
+    0x00, 0x00                       // No thumbnail
+};
+
+// Standard Huffman tables for baseline JPEG (YCbCr)
+static const uint8_t dcLuminanceBits[]   = {0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0};
+static const uint8_t dcLuminanceVals[]   = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+static const uint8_t dcChrominanceBits[] = {0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0};
+static const uint8_t dcChrominanceVals[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+
+static const uint8_t acLuminanceBits[] = {0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0x7d};
+static const uint8_t acLuminanceVals[] = {
+    0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12, 0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
+    0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08, 0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0,
+    0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
+    0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
+    0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+    0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
+    0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+    0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5,
+    0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
+    0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
+    0xf9, 0xfa
+};
+
+static const uint8_t acChrominanceBits[] = {0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 0x77};
+static const uint8_t acChrominanceVals[] = {
+    0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21, 0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
+    0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91, 0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0,
+    0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34, 0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
+    0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
+    0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
+    0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+    0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,
+    0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3,
+    0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
+    0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
+    0xf9, 0xfa
+};
+
+// Write 16-bit big-endian value
+static void write16be(uint8_t *ptr, uint16_t value) {
+    ptr[0] = (uint8_t)((value >> 8) & 0xFFU);
+    ptr[1] = (uint8_t)(value & 0xFFU);
+}
+
+static JPEGContext *getJPEGContext(NVContext *ctx, bool createIfMissing) {
+    if (ctx->codecData == NULL) {
+        if (!createIfMissing) {
+            return NULL;
+        }
+
+        ctx->codecData = calloc(1, sizeof(JPEGContext));
+        if (ctx->codecData == NULL) {
+            LOG("JPEG: Failed to allocate codec context");
+            return NULL;
+        }
+    }
+
+    return (JPEGContext *)ctx->codecData;
+}
+
+static void resetJPEGPictureState(NVContext *ctx) {
+    JPEGContext *jpegCtx = getJPEGContext(ctx, false);
+    if (jpegCtx != NULL) {
+        // VA-API allows IQ/Huffman tables to be reused across pictures, so only
+        // clear the per-picture frame header state here.
+        memset(&jpegCtx->picParams, 0, sizeof(jpegCtx->picParams));
+        jpegCtx->hasPicParams = false;
+    }
+
+    ctx->lastSliceParams = NULL;
+    ctx->lastSliceParamsCount = 0U;
+}
+
+static bool componentInFrame(const VAPictureParameterBufferJPEGBaseline *pic, uint8_t componentId) {
+    for (uint32_t i = 0; i < pic->num_components; i++) {
+        if (pic->components[i].component_id == componentId) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+static bool getUsedQuantTablesMask(const VAPictureParameterBufferJPEGBaseline *pic, uint8_t *outMask) {
+    uint8_t usedMask = 0;
+
+    for (uint32_t i = 0; i < pic->num_components; i++) {
+        uint8_t tableId = pic->components[i].quantiser_table_selector;
+        if (tableId >= 4U) {
+            LOG("JPEG: Invalid quantiser table selector %u", tableId);
+            return false;
+        }
+
+        usedMask |= (uint8_t)(1U << tableId);
+    }
+
+    *outMask = usedMask;
+    return true;
+}
+
+// Write DQT (Define Quantization Table) segments for all tables used by this frame.
+static bool writeDQT(uint8_t **pptr,
+                     const VAIQMatrixBufferJPEGBaseline *iq,
+                     const VAPictureParameterBufferJPEGBaseline *pic,
+                     uint8_t validMask) {
+    uint8_t usedMask = 0;
+    uint8_t *ptr = *pptr;
+
+    if (!getUsedQuantTablesMask(pic, &usedMask)) {
+        return false;
+    }
+
+    if ((validMask & usedMask) != usedMask) {
+        LOG("JPEG: Missing quant tables (used=0x%02x, valid=0x%02x)", usedMask, validMask);
+        return false;
+    }
+
+    for (uint32_t table = 0; table < 4U; table++) {
+        uint8_t bit = (uint8_t)(1U << table);
+        if ((usedMask & bit) == 0) {
+            continue;
+        }
+
+        uint32_t sum = 0;
+        for (uint32_t j = 0; j < 64U; j++) {
+            sum += iq->quantiser_table[table][j];
+        }
+        if (sum == 0U) {
+            LOG("JPEG: Quant table %u is empty", table);
+            return false;
+        }
+
+        *ptr++ = JPEG_MARKER;
+        *ptr++ = JPEG_DQT;
+        write16be(ptr, 67U); // Length (2 + 1 + 64)
+        ptr += 2;
+        *ptr++ = (uint8_t)table;
+        memcpy(ptr, iq->quantiser_table[table], 64U);
+        ptr += 64;
+    }
+
+    *pptr = ptr;
+    return true;
+}
+
+// Write DRI (Define Restart Interval) segment
+static uint8_t *writeDRI(uint8_t *ptr, uint16_t restartInterval) {
+    *ptr++ = JPEG_MARKER;
+    *ptr++ = JPEG_DRI;
+    write16be(ptr, 4U); // Length (2 + 2 bytes interval)
+    ptr += 2;
+    write16be(ptr, restartInterval);
+    ptr += 2;
+    return ptr;
+}
+
+// Write SOF0 (Start of Frame) segment
+static uint8_t *writeSOF0(uint8_t *ptr, const VAPictureParameterBufferJPEGBaseline *pic) {
+    *ptr++ = JPEG_MARKER;
+    *ptr++ = JPEG_SOF0;
+
+    uint16_t length = (uint16_t)(2U + 1U + 2U + 2U + 1U + ((uint16_t)pic->num_components * 3U));
+    write16be(ptr, length);
+    ptr += 2;
+
+    *ptr++ = 8U; // Precision (8 bits)
+    write16be(ptr, pic->picture_height);
+    ptr += 2;
+    write16be(ptr, pic->picture_width);
+    ptr += 2;
+    *ptr++ = pic->num_components;
+
+    for (uint32_t i = 0; i < pic->num_components; i++) {
+        *ptr++ = pic->components[i].component_id;
+        *ptr++ = (uint8_t)((pic->components[i].h_sampling_factor << 4) |
+                           pic->components[i].v_sampling_factor);
+        *ptr++ = pic->components[i].quantiser_table_selector;
+    }
+
+    return ptr;
+}
+
+// Write DHT (Define Huffman Table) segment
+static uint8_t *writeDHT(uint8_t *ptr,
+                         const uint8_t *bits,
+                         const uint8_t *vals,
+                         uint8_t tableClass,
+                         uint8_t tableId,
+                         uint32_t numVals) {
+    *ptr++ = JPEG_MARKER;
+    *ptr++ = JPEG_DHT;
+
+    uint16_t length = (uint16_t)(2U + 1U + 16U + numVals);
+    write16be(ptr, length);
+    ptr += 2;
+
+    *ptr++ = (uint8_t)((tableClass << 4) | tableId);
+    memcpy(ptr, bits, 16U);
+    ptr += 16;
+    memcpy(ptr, vals, numVals);
+    ptr += numVals;
+
+    return ptr;
+}
+
+// Write standard Huffman tables
+static uint8_t *writeStandardHuffmanTables(uint8_t *ptr) {
+    ptr = writeDHT(ptr, dcLuminanceBits, dcLuminanceVals, 0U, 0U, 12U);
+    ptr = writeDHT(ptr, dcChrominanceBits, dcChrominanceVals, 0U, 1U, 12U);
+    ptr = writeDHT(ptr, acLuminanceBits, acLuminanceVals, 1U, 0U, 162U);
+    ptr = writeDHT(ptr, acChrominanceBits, acChrominanceVals, 1U, 1U, 162U);
+    return ptr;
+}
+
+static bool countHuffValues(const uint8_t codes[16], uint32_t maxVals, uint32_t *outCount) {
+    uint32_t sum = 0;
+    for (uint32_t i = 0; i < 16U; i++) {
+        sum += codes[i];
+        if (sum > maxVals) {
+            return false;
+        }
+    }
+
+    *outCount = sum;
+    return true;
+}
+
+// Write Huffman tables from VA-API. Returns false if tables are invalid/out-of-range.
+static bool writeVAHuffmanTables(uint8_t **pptr,
+                                 const VAHuffmanTableBufferJPEGBaseline *huffman,
+                                 uint8_t requiredDcMask,
+                                 uint8_t requiredAcMask,
+                                 uint8_t validDcMask,
+                                 uint8_t validAcMask) {
+    uint8_t *ptr = *pptr;
+
+    for (uint32_t tableIdx = 0; tableIdx < 2U; tableIdx++) {
+        uint8_t bit = (uint8_t)(1U << tableIdx);
+
+        if ((requiredDcMask & bit) != 0U) {
+            if ((validDcMask & bit) == 0U) {
+                LOG("JPEG: Missing required DC Huffman table %u", tableIdx);
+                return false;
+            }
+
+            uint32_t numDcValues = 0;
+            if (!countHuffValues(huffman->huffman_table[tableIdx].num_dc_codes, 12U, &numDcValues) ||
+                numDcValues == 0U) {
+                LOG("JPEG: Invalid DC Huffman table %u (count=%u)", tableIdx, numDcValues);
+                return false;
+            }
+
+            ptr = writeDHT(ptr,
+                           huffman->huffman_table[tableIdx].num_dc_codes,
+                           huffman->huffman_table[tableIdx].dc_values,
+                           0U,
+                           (uint8_t)tableIdx,
+                           numDcValues);
+        }
+
+        if ((requiredAcMask & bit) != 0U) {
+            if ((validAcMask & bit) == 0U) {
+                LOG("JPEG: Missing required AC Huffman table %u", tableIdx);
+                return false;
+            }
+
+            uint32_t numAcValues = 0;
+            if (!countHuffValues(huffman->huffman_table[tableIdx].num_ac_codes, 162U, &numAcValues) ||
+                numAcValues == 0U) {
+                LOG("JPEG: Invalid AC Huffman table %u (count=%u)", tableIdx, numAcValues);
+                return false;
+            }
+
+            ptr = writeDHT(ptr,
+                           huffman->huffman_table[tableIdx].num_ac_codes,
+                           huffman->huffman_table[tableIdx].ac_values,
+                           1U,
+                           (uint8_t)tableIdx,
+                           numAcValues);
+        }
+    }
+
+    *pptr = ptr;
+    return true;
+}
+
+static bool validateSliceAndCollectHuffmanUsage(const VAPictureParameterBufferJPEGBaseline *pic,
+                                                const VASliceParameterBufferJPEGBaseline *slice,
+                                                uint8_t *requiredDcMask,
+                                                uint8_t *requiredAcMask) {
+    if (slice->num_components == 0U || slice->num_components > JPEG_MAX_COMPONENTS) {
+        LOG("JPEG: Unsupported scan component count: %u", slice->num_components);
+        return false;
+    }
+
+    for (uint32_t i = 0; i < slice->num_components; i++) {
+        uint8_t componentSelector = slice->components[i].component_selector;
+        uint8_t dcSelector = slice->components[i].dc_table_selector;
+        uint8_t acSelector = slice->components[i].ac_table_selector;
+
+        if (!componentInFrame(pic, componentSelector)) {
+            LOG("JPEG: Scan references unknown frame component id %u", componentSelector);
+            return false;
+        }
+
+        if (dcSelector > 1U || acSelector > 1U) {
+            LOG("JPEG: Huffman selector out of range (dc=%u ac=%u)", dcSelector, acSelector);
+            return false;
+        }
+
+        *requiredDcMask |= (uint8_t)(1U << dcSelector);
+        *requiredAcMask |= (uint8_t)(1U << acSelector);
+    }
+
+    return true;
+}
+
+// Write SOS (Start of Scan) segment
+static uint8_t *writeSOS(uint8_t *ptr, const VASliceParameterBufferJPEGBaseline *slice) {
+    *ptr++ = JPEG_MARKER;
+    *ptr++ = JPEG_SOS;
+
+    uint16_t length = (uint16_t)(2U + 1U + ((uint16_t)slice->num_components * 2U) + 3U);
+    write16be(ptr, length);
+    ptr += 2;
+
+    *ptr++ = slice->num_components;
+
+    for (uint32_t i = 0; i < slice->num_components; i++) {
+        *ptr++ = slice->components[i].component_selector;
+        *ptr++ = (uint8_t)((slice->components[i].dc_table_selector << 4) |
+                           slice->components[i].ac_table_selector);
+    }
+
+    *ptr++ = 0U;   // Ss (start of spectral selection)
+    *ptr++ = 63U;  // Se (end of spectral selection)
+    *ptr++ = 0U;   // Ah/Al (successive approximation)
 
-static void copyJPEGPicParam(NVContext *ctx, NVBuffer* buffer, CUVIDPICPARAMS *picParams)
+    return ptr;
+}
+
+// Reconstruct complete JPEG frame
+static uint8_t *reconstructJPEG(const JPEGContext *jpegCtx,
+                                const VASliceParameterBufferJPEGBaseline *slices,
+                                uint32_t sliceCount,
+                                const uint8_t *sliceData,
+                                uint32_t sliceDataSize,
+                                uint32_t *outSize) {
+    if (!jpegCtx->hasPicParams || !jpegCtx->hasIQMatrix) {
+        LOG("JPEG: Missing picture params or IQ matrix");
+        return NULL;
+    }
+
+    if (jpegCtx->picParams.picture_width == 0U || jpegCtx->picParams.picture_height == 0U) {
+        LOG("JPEG: Invalid dimensions: %ux%u",
+            jpegCtx->picParams.picture_width,
+            jpegCtx->picParams.picture_height);
+        return NULL;
+    }
+
+    if (jpegCtx->picParams.num_components == 0U || jpegCtx->picParams.num_components > JPEG_MAX_COMPONENTS) {
+        LOG("JPEG: Unsupported frame component count: %u", jpegCtx->picParams.num_components);
+        return NULL;
+    }
+
+    uint8_t usedQuantMask = 0;
+    if (!getUsedQuantTablesMask(&jpegCtx->picParams, &usedQuantMask)) {
+        return NULL;
+    }
+
+    if ((jpegCtx->validQuantTablesMask & usedQuantMask) != usedQuantMask) {
+        LOG("JPEG: Missing required quant tables (used=0x%02x valid=0x%02x)",
+            usedQuantMask,
+            jpegCtx->validQuantTablesMask);
+        return NULL;
+    }
+
+    if (sliceCount == 0U) {
+        LOG("JPEG: No slice parameters");
+        return NULL;
+    }
+
+    uint64_t totalEcsSize = 0;
+    uint8_t requiredDcMask = 0;
+    uint8_t requiredAcMask = 0;
+
+    for (uint32_t i = 0; i < sliceCount; i++) {
+        const VASliceParameterBufferJPEGBaseline *slice = &slices[i];
+
+        if (slice->slice_data_flag != VA_SLICE_DATA_FLAG_ALL) {
+            LOG("JPEG: slice_data_flag=%u not supported (expected ALL)", slice->slice_data_flag);
+            return NULL;
+        }
+
+        if (!validateSliceAndCollectHuffmanUsage(&jpegCtx->picParams,
+                                                 slice,
+                                                 &requiredDcMask,
+                                                 &requiredAcMask)) {
+            return NULL;
+        }
+
+        if (slice->slice_data_offset > sliceDataSize) {
+            LOG("JPEG: Invalid slice_data_offset (%u) exceeds buffer size (%u)",
+                slice->slice_data_offset,
+                sliceDataSize);
+            return NULL;
+        }
+
+        uint32_t availableData = sliceDataSize - slice->slice_data_offset;
+        if (slice->slice_data_size > availableData) {
+            LOG("JPEG: Invalid slice_data_size (%u) exceeds available data (%u)",
+                slice->slice_data_size,
+                availableData);
+            return NULL;
+        }
+
+        if (UINT64_MAX - totalEcsSize < slice->slice_data_size) {
+            LOG("JPEG: Total ECS size overflow");
+            return NULL;
+        }
+        totalEcsSize += slice->slice_data_size;
+    }
+
+    const VASliceParameterBufferJPEGBaseline *slice0 = &slices[0];
+
+    bool allSameSOSHeader = true;
+    bool allSameRestartInterval = true;
+
+    for (uint32_t i = 1; i < sliceCount; i++) {
+        const VASliceParameterBufferJPEGBaseline *slice = &slices[i];
+
+        if (slice->restart_interval != slice0->restart_interval) {
+            allSameRestartInterval = false;
+        }
+
+        if (slice->num_components != slice0->num_components) {
+            allSameSOSHeader = false;
+            continue;
+        }
+
+        for (uint32_t c = 0; c < slice->num_components; c++) {
+            if (slice->components[c].component_selector != slice0->components[c].component_selector ||
+                slice->components[c].dc_table_selector != slice0->components[c].dc_table_selector ||
+                slice->components[c].ac_table_selector != slice0->components[c].ac_table_selector) {
+                allSameSOSHeader = false;
+                break;
+            }
+        }
+    }
+
+    // Worst-case buffer size calculation (overestimate for safety)
+    const uint64_t dqtSize = 4U * (2U + 2U + 1U + 64U);
+    const uint64_t sof0Size = 2U + 2U + 1U + 2U + 2U + 1U +
+                              ((uint64_t)jpegCtx->picParams.num_components * 3U);
+    const uint64_t dhtSize = 2U * ((2U + 2U + 1U + 16U + 12U) +
+                                   (2U + 2U + 1U + 16U + 162U));
+    const uint64_t driSize = (uint64_t)sliceCount * (2U + 2U + 2U);
+    const uint64_t sosSize = (uint64_t)sliceCount * (2U + 2U + 1U + 4U * 2U + 3U);
+
+    uint64_t maxSize64 = sizeof(jfifHeader) + dqtSize + sof0Size + dhtSize +
+                         driSize + sosSize + totalEcsSize + 2U;
+
+    if (maxSize64 > SIZE_MAX) {
+        LOG("JPEG: Frame size too large to allocate (%llu bytes)", (unsigned long long)maxSize64);
+        return NULL;
+    }
+
+    uint8_t *frame = (uint8_t *)malloc((size_t)maxSize64);
+    if (frame == NULL) {
+        LOG("JPEG: Failed to allocate frame buffer");
+        return NULL;
+    }
+
+    uint8_t *ptr = frame;
+
+    // 1. SOI + JFIF header
+    memcpy(ptr, jfifHeader, sizeof(jfifHeader));
+    ptr += sizeof(jfifHeader);
+
+    // 2. DQT
+    if (!writeDQT(&ptr, &jpegCtx->iqMatrix, &jpegCtx->picParams, jpegCtx->validQuantTablesMask)) {
+        free(frame);
+        return NULL;
+    }
+
+    // 3. SOF0
+    ptr = writeSOF0(ptr, &jpegCtx->picParams);
+
+    // 4. DHT (VA tables if complete and valid, else standard)
+    bool useVAHuffman = jpegCtx->hasHuffmanTable &&
+                        ((jpegCtx->validHuffmanDcMask & requiredDcMask) == requiredDcMask) &&
+                        ((jpegCtx->validHuffmanAcMask & requiredAcMask) == requiredAcMask);
+
+    if (useVAHuffman) {
+        uint8_t *tmp = ptr;
+        if (writeVAHuffmanTables(&tmp,
+                                 &jpegCtx->huffmanTable,
+                                 requiredDcMask,
+                                 requiredAcMask,
+                                 jpegCtx->validHuffmanDcMask,
+                                 jpegCtx->validHuffmanAcMask)) {
+            ptr = tmp;
+        } else {
+            ptr = writeStandardHuffmanTables(ptr);
+        }
+    } else {
+        ptr = writeStandardHuffmanTables(ptr);
+    }
+
+    // 4b. DRI (Restart interval) once if consistent across slices
+    if (allSameRestartInterval && slice0->restart_interval != 0U) {
+        ptr = writeDRI(ptr, slice0->restart_interval);
+    }
+
+    // 5/6. Scan(s)
+    if (allSameSOSHeader) {
+        ptr = writeSOS(ptr, slice0);
+        for (uint32_t i = 0; i < sliceCount; i++) {
+            const VASliceParameterBufferJPEGBaseline *slice = &slices[i];
+            memcpy(ptr, sliceData + slice->slice_data_offset, slice->slice_data_size);
+            ptr += slice->slice_data_size;
+        }
+    } else {
+        for (uint32_t i = 0; i < sliceCount; i++) {
+            const VASliceParameterBufferJPEGBaseline *slice = &slices[i];
+
+            // If restart_interval wasn't consistent globally, emit per-scan DRI.
+            if (!allSameRestartInterval && slice->restart_interval != 0U) {
+                ptr = writeDRI(ptr, slice->restart_interval);
+            }
+
+            ptr = writeSOS(ptr, slice);
+            memcpy(ptr, sliceData + slice->slice_data_offset, slice->slice_data_size);
+            ptr += slice->slice_data_size;
+        }
+    }
+
+    // 7. EOI (avoid duplicating if client already included it)
+    if (!(ptr - frame >= 2 && ptr[-2] == JPEG_MARKER && ptr[-1] == JPEG_EOI)) {
+        *ptr++ = JPEG_MARKER;
+        *ptr++ = JPEG_EOI;
+    }
+
+    uint64_t frameSize64 = (uint64_t)(ptr - frame);
+    if (frameSize64 > UINT32_MAX) {
+        LOG("JPEG: Reconstructed frame too large (%llu bytes)", (unsigned long long)frameSize64);
+        free(frame);
+        return NULL;
+    }
+
+    *outSize = (uint32_t)frameSize64;
+    return frame;
+}
+
+static void copyJPEGPicParam(NVContext *ctx, NVBuffer *buffer, CUVIDPICPARAMS *picParams)
 {
-    VAPictureParameterBufferJPEGBaseline* buf = (VAPictureParameterBufferJPEGBaseline*) buffer->ptr;
+    VAPictureParameterBufferJPEGBaseline *buf = (VAPictureParameterBufferJPEGBaseline *)buffer->ptr;
+    JPEGContext *jpegCtx = getJPEGContext(ctx, true);
 
-    picParams->PicWidthInMbs = (int) ( buf->picture_width + 15) / 16; //int
-    picParams->FrameHeightInMbs = (int) ( buf->picture_height + 15) / 16; //int
+    if (jpegCtx != NULL) {
+        memcpy(&jpegCtx->picParams, buf, sizeof(VAPictureParameterBufferJPEGBaseline));
+        jpegCtx->hasPicParams = buf->picture_width != 0U &&
+                                buf->picture_height != 0U &&
+                                buf->num_components > 0U &&
+                                buf->num_components <= JPEG_MAX_COMPONENTS;
+    }
 
-    picParams->field_pic_flag    = 0;
+    picParams->PicWidthInMbs = (int)((buf->picture_width + 15U) / 16U);
+    picParams->FrameHeightInMbs = (int)((buf->picture_height + 15U) / 16U);
+    picParams->field_pic_flag = 0;
     picParams->bottom_field_flag = 0;
-    picParams->second_field      = 0;
+    picParams->second_field = 0;
+    picParams->intra_pic_flag = 1;
+    picParams->ref_pic_flag = 0;
+}
+
+static void copyJPEGIQMatrix(NVContext *ctx, NVBuffer *buffer, CUVIDPICPARAMS *picParams)
+{
+    VAIQMatrixBufferJPEGBaseline *buf = (VAIQMatrixBufferJPEGBaseline *)buffer->ptr;
+    JPEGContext *jpegCtx = getJPEGContext(ctx, true);
+
+    if (jpegCtx == NULL) {
+        return;
+    }
+
+    for (uint32_t table = 0; table < 4U; table++) {
+        if (buf->load_quantiser_table[table] != 0U) {
+            memcpy(jpegCtx->iqMatrix.quantiser_table[table],
+                   buf->quantiser_table[table],
+                   sizeof(jpegCtx->iqMatrix.quantiser_table[table]));
+            jpegCtx->validQuantTablesMask |= (uint8_t)(1U << table);
+        }
+    }
+
+    jpegCtx->hasIQMatrix = jpegCtx->validQuantTablesMask != 0U;
+    (void)picParams;
+}
+
+static void copyJPEGHuffmanTable(NVContext *ctx, NVBuffer *buffer, CUVIDPICPARAMS *picParams)
+{
+    VAHuffmanTableBufferJPEGBaseline *buf = (VAHuffmanTableBufferJPEGBaseline *)buffer->ptr;
+    JPEGContext *jpegCtx = getJPEGContext(ctx, true);
 
-    picParams->intra_pic_flag    = 1;
-    picParams->ref_pic_flag      = 0;
+    if (jpegCtx == NULL) {
+        return;
+    }
+
+    for (uint32_t table = 0; table < 2U; table++) {
+        if (buf->load_huffman_table[table] != 0U) {
+            memcpy(jpegCtx->huffmanTable.huffman_table[table].num_dc_codes,
+                   buf->huffman_table[table].num_dc_codes,
+                   sizeof(jpegCtx->huffmanTable.huffman_table[table].num_dc_codes));
+            memcpy(jpegCtx->huffmanTable.huffman_table[table].dc_values,
+                   buf->huffman_table[table].dc_values,
+                   sizeof(jpegCtx->huffmanTable.huffman_table[table].dc_values));
+            memcpy(jpegCtx->huffmanTable.huffman_table[table].num_ac_codes,
+                   buf->huffman_table[table].num_ac_codes,
+                   sizeof(jpegCtx->huffmanTable.huffman_table[table].num_ac_codes));
+            memcpy(jpegCtx->huffmanTable.huffman_table[table].ac_values,
+                   buf->huffman_table[table].ac_values,
+                   sizeof(jpegCtx->huffmanTable.huffman_table[table].ac_values));
+
+            jpegCtx->validHuffmanDcMask |= (uint8_t)(1U << table);
+            jpegCtx->validHuffmanAcMask |= (uint8_t)(1U << table);
+        }
+    }
+
+    jpegCtx->hasHuffmanTable = (jpegCtx->validHuffmanDcMask | jpegCtx->validHuffmanAcMask) != 0U;
+    (void)picParams;
 }
 
-static void copyJPEGSliceParam(NVContext *ctx, NVBuffer* buf, CUVIDPICPARAMS *picParams)
+static void copyJPEGSliceParam(NVContext *ctx, NVBuffer *buf, CUVIDPICPARAMS *picParams)
 {
     ctx->lastSliceParams = buf->ptr;
     ctx->lastSliceParamsCount = buf->elements;
-
-    picParams->nNumSlices += buf->elements;
+    (void)picParams;
 }
 
-static void copyJPEGSliceData(NVContext *ctx, NVBuffer* buf, CUVIDPICPARAMS *picParams)
+static void copyJPEGSliceData(NVContext *ctx, NVBuffer *buf, CUVIDPICPARAMS *picParams)
 {
-    for (unsigned int i = 0; i < ctx->lastSliceParamsCount; i++)
-    {
-        VASliceParameterBufferJPEGBaseline *sliceParams = &((VASliceParameterBufferJPEGBaseline*) ctx->lastSliceParams)[i];
-        uint32_t offset = (uint32_t) ctx->bitstreamBuffer.size;
-        appendBuffer(&ctx->sliceOffsets, &offset, sizeof(offset));
-        appendBuffer(&ctx->bitstreamBuffer, PTROFF(buf->ptr, sliceParams->slice_data_offset), sliceParams->slice_data_size);
-        picParams->nBitstreamDataLen += sliceParams->slice_data_size;
+    JPEGContext *jpegCtx = getJPEGContext(ctx, false);
+    if (jpegCtx == NULL) {
+        LOG("JPEG: No codec context available");
+        return;
+    }
+
+    if (ctx->lastSliceParams == NULL || ctx->lastSliceParamsCount == 0U) {
+        LOG("JPEG: No slice parameters available");
+        return;
+    }
+
+    const VASliceParameterBufferJPEGBaseline *slices =
+        (const VASliceParameterBufferJPEGBaseline *)ctx->lastSliceParams;
+
+    LOG("JPEG: Processing %u slice(s), input size %zu bytes",
+        ctx->lastSliceParamsCount,
+        buf->size);
+
+    if (buf->size > UINT32_MAX) {
+        LOG("JPEG: Slice data too large (%zu bytes)", buf->size);
+        return;
+    }
+
+    uint32_t frameSize = 0;
+    uint8_t *frame = reconstructJPEG(jpegCtx,
+                                     slices,
+                                     ctx->lastSliceParamsCount,
+                                     (const uint8_t *)buf->ptr,
+                                     (uint32_t)buf->size,
+                                     &frameSize);
+    if (frame == NULL) {
+        LOG("JPEG: Failed to reconstruct JPEG frame");
+        return;
     }
+
+    if (ctx->bitstreamBuffer.size > UINT32_MAX - frameSize) {
+        LOG("JPEG: Reconstructed bitstream would overflow CUVID limit");
+        free(frame);
+        return;
+    }
+
+    // NVDEC can consume a full JPEG as a single "slice" (same approach as FFmpeg's mjpeg_nvdec)
+    picParams->nNumSlices = 1U;
+
+    uint32_t offset = (uint32_t)ctx->bitstreamBuffer.size;
+    appendBuffer(&ctx->sliceOffsets, &offset, sizeof(offset));
+    appendBuffer(&ctx->bitstreamBuffer, frame, frameSize);
+    picParams->nBitstreamDataLen = (uint32_t)ctx->bitstreamBuffer.size;
+
+    LOG("JPEG: Reconstructed %u bytes for NVDEC", frameSize);
+
+    free(frame);
 }
 
 static cudaVideoCodec computeJPEGCudaCodec(VAProfile profile) {
@@ -51,13 +755,20 @@ static const VAProfile jpegSupportedProfiles[] = {
     VAProfileJPEGBaseline,
 };
 
-const DECLARE_DISABLED_CODEC(jpegCodec) = {
+static void jpegBeginPicture(NVContext *ctx) {
+    resetJPEGPictureState(ctx);
+}
+
+const DECLARE_CODEC(jpegCodec) = {
     .computeCudaCodec = computeJPEGCudaCodec,
     .handlers = {
         [VAPictureParameterBufferType] = copyJPEGPicParam,
+        [VAIQMatrixBufferType] = copyJPEGIQMatrix,
+        [VAHuffmanTableBufferType] = copyJPEGHuffmanTable,
         [VASliceParameterBufferType] = copyJPEGSliceParam,
         [VASliceDataBufferType] = copyJPEGSliceData,
     },
     .supportedProfileCount = ARRAY_SIZE(jpegSupportedProfiles),
     .supportedProfiles = jpegSupportedProfiles,
+    .beginPicture = jpegBeginPicture,
 };
diff --git a/src/vabackend.c b/src/vabackend.c
index c7c7a758..b5518ede 100644
--- a/src/vabackend.c
+++ b/src/vabackend.c
@@ -330,6 +330,9 @@ static bool destroyContext(NVDriver *drv, NVContext *nvCtx) {
     int ret = pthread_timedjoin_np(nvCtx->resolveThread, NULL, &timeout);
     LOG("Finished waiting for resolve thread with %d", ret);
 
+    free(nvCtx->codecData);
+    nvCtx->codecData = NULL;
+
     freeBuffer(&nvCtx->sliceOffsets);
     freeBuffer(&nvCtx->bitstreamBuffer);
 
@@ -1342,6 +1345,9 @@ static VAStatus nvBeginPicture(
     nvCtx->renderTarget = surface;
     nvCtx->renderTarget->progressiveFrame = true; //assume we're producing progressive frame unless the codec says otherwise
     nvCtx->pPicParams.CurrPicIdx = nvCtx->renderTarget->pictureIdx;
+    if (nvCtx->codec != NULL && nvCtx->codec->beginPicture != NULL) {
+        nvCtx->codec->beginPicture(nvCtx);
+    }
 
     return VA_STATUS_SUCCESS;
 }
diff --git a/src/vabackend.h b/src/vabackend.h
index 568cab07..672c489f 100644
--- a/src/vabackend.h
+++ b/src/vabackend.h
@@ -167,6 +167,7 @@ typedef struct _NVContext
     uint32_t            height;
     CUvideodecoder      decoder;
     NVSurface           *renderTarget;
+    void                *codecData;
     void                *lastSliceParams;
     unsigned int        lastSliceParamsCount;
     AppendableBuffer    bitstreamBuffer;
@@ -198,6 +199,7 @@ typedef struct
 
 typedef void (*HandlerFunc)(NVContext*, NVBuffer* , CUVIDPICPARAMS*);
 typedef cudaVideoCodec (*ComputeCudaCodec)(VAProfile);
+typedef void (*CodecBeginPictureFunc)(NVContext*);
 
 //padding/alignment is very important to this structure as it's placed in it's own section
 //in the executable.
@@ -206,6 +208,7 @@ struct _NVCodec {
     HandlerFunc         handlers[VABufferTypeMax];
     int                 supportedProfileCount;
     const VAProfile     *supportedProfiles;
+    CodecBeginPictureFunc beginPicture;
 };
 
 typedef struct _NVCodec NVCodec;