diff --git a/Cargo.lock b/Cargo.lock index 596bdeb..c1ceb71 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -14,6 +14,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bytemuck" +version = "1.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72957246c41db82b8ef88a5486143830adeb8227ef9837740bdec67724cf2c5b" + [[package]] name = "h263-rs" version = "0.1.0" @@ -28,7 +34,8 @@ dependencies = [ name = "h263-rs-yuv" version = "0.1.0" dependencies = [ - "lazy_static", + "bytemuck", + "wide", ] [[package]] @@ -64,6 +71,15 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "safe_arch" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "794821e4ccb0d9f979512f9c1973480123f9bd62a90d74ab0f9426fcf8f4a529" +dependencies = [ + "bytemuck", +] + [[package]] name = "syn" version = "1.0.75" @@ -100,3 +116,13 @@ name = "unicode-xid" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" + +[[package]] +name = "wide" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "476da2f1d225632b1fffe638ff979a4bc03907e29b0ab596efca7624014f8b62" +dependencies = [ + "bytemuck", + "safe_arch", +] diff --git a/yuv/Cargo.toml b/yuv/Cargo.toml index 259c893..85abb84 100644 --- a/yuv/Cargo.toml +++ b/yuv/Cargo.toml @@ -6,4 +6,5 @@ edition = "2018" license = "MIT OR Apache-2.0" [dependencies] -lazy_static = "1.4.0" \ No newline at end of file +wide = "0.7.3" +bytemuck = "1.7.2" \ No newline at end of file diff --git a/yuv/src/bt601.rs b/yuv/src/bt601.rs index 244e3ce..d3b9c70 100644 --- a/yuv/src/bt601.rs +++ b/yuv/src/bt601.rs @@ -1,96 +1,93 @@ //! YUV-to-RGB decode -use lazy_static::lazy_static; - -/// Precomputes and stores the linear functions for converting YUV (YCb'Cr' to be precise) -/// colors to RGB (sRGB-like, with gamma) colors, in signed 12.4 fixed-point integer format. -/// -/// Since the incoming components are u8, and there is only ever at most 3 of them added -/// at once (when computing the G channel), only about 10 bits would be used if they were -/// u8 - so to get some more precision (and reduce potential stepping artifacts), might -/// as well use about 14 of the 15 (not counting the sign bit) available in i16. -struct LUTs { - /// the contribution of the Y component into all RGB channels - pub y_to_gray: [i16; 256], - /// the contribution of the V (Cr') component into the R channel - pub cr_to_r: [i16; 256], - /// the contribution of the V (Cr') component into the G channel - pub cr_to_g: [i16; 256], - /// the contribution of the U (Cb') component into the G channel - pub cb_to_g: [i16; 256], - /// the contribution of the U (Cb') component into the B channel - pub cb_to_b: [i16; 256], -} - -impl LUTs { - pub fn new() -> LUTs { - let mut y_to_gray = [0i16; 256]; - let mut cr_to_r = [0i16; 256]; - let mut cr_to_g = [0i16; 256]; - let mut cb_to_g = [0i16; 256]; - let mut cb_to_b = [0i16; 256]; - - // - Y needs to be remapped linearly from 16..235 to 0..255 - // - Cr' and Cb' (a.k.a. V and U) need to be remapped linearly from 16..240 to 0..255, - // then shifted to -128..127, and then scaled by the appropriate coefficients - // - Finally all values are multiplied by 16 (1<<4) to turn them into 12.4 format, and rounded to integer. 
-
-        for i in 0..256 {
-            let f = i as f32;
-
-            // According to Wikipedia, these are the exact values from the
-            // ITU-R BT.601 standard. See the last group of equations on:
-            // https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion
-            let y2gray = (255.0 / 219.0) * (f - 16.0);
-            let cr2r = (255.0 / 224.0) * 1.402 * (f - 128.0);
-            let cr2g = -(255.0 / 224.0) * 1.402 * (0.299 / 0.587) * (f - 128.0);
-            let cb2g = -(255.0 / 224.0) * 1.772 * (0.114 / 0.587) * (f - 128.0);
-            let cb2b = (255.0 / 224.0) * 1.772 * (f - 128.0);
-
-            // Converting to 12.4 format and rounding before storing
-            y_to_gray[i] = (y2gray * 16.0).round() as i16;
-            cr_to_r[i] = (cr2r * 16.0).round() as i16;
-            cr_to_g[i] = (cr2g * 16.0).round() as i16;
-            cb_to_g[i] = (cb2g * 16.0).round() as i16;
-            cb_to_b[i] = (cb2b * 16.0).round() as i16;
-        }
-
-        LUTs {
-            y_to_gray,
-            cr_to_r,
-            cr_to_g,
-            cb_to_g,
-            cb_to_b,
-        }
-    }
-}
+// TODO: Replace with `std::simd` when it's stable
+use wide::{i32x4, u8x16};
+
+// Operates on 4 pixels at a time, one pixel per SIMD lane,
+// with 32 bits of intermediate per-component precision for
+// each, so as to fill the 128-bit SIMD registers on WASM.
+// Using i32x4 also allows the neat transpose trick at the end.
+// The output is an interleaved array of 4 RGBA pixels.
+#[inline]
+fn yuv_to_rgba_4x(yuv: (&[u8; 4], &[u8; 2], &[u8; 2]), rgba: &mut [u8; 16]) {
+    let (y, cb, cr) = yuv;
 
-lazy_static! {
-    static ref LUTS: LUTs = LUTs::new();
+    // Expanding the 4 bytes into an i32x4, and duplicating chroma samples horizontally.
+    // The -16 and -128 are simply undoing the offsets in the input representation.
+    let y = i32x4::from([y[0] as i32, y[1] as i32, y[2] as i32, y[3] as i32]) - i32x4::splat(16);
+    let cb =
+        i32x4::from([cb[0] as i32, cb[0] as i32, cb[1] as i32, cb[1] as i32]) - i32x4::splat(128);
+    let cr =
+        i32x4::from([cr[0] as i32, cr[0] as i32, cr[1] as i32, cr[1] as i32]) - i32x4::splat(128);
+
+    // The rest of the magic numbers are the coefficients, converted to 16.16 fixed point and rounded.
+    // They also include the expansion from reduced range (16..235 and 16..240) to full range (0..255).
+    let gray = y * i32x4::splat(76309); // 76309 == round((255.0 / 219.0) * 65536.0)
+    let cr2r = cr * i32x4::splat(104597); // 104597 == round((255.0 / 224.0) * 1.402 * 65536.0)
+    let cr2g = cr * i32x4::splat(-53279); // -53279 == round(-(255.0 / 224.0) * 1.402 * (0.299 / 0.587) * 65536.0)
+    let cb2g = cb * i32x4::splat(-25675); // -25675 == round(-(255.0 / 224.0) * 1.772 * (0.114 / 0.587) * 65536.0)
+    let cb2b = cb * i32x4::splat(132201); // 132201 == round((255.0 / 224.0) * 1.772 * 65536.0)
+
+    // This is 0.5 in 16.16 format, added to make the right shift round correctly
+    let half = i32x4::splat(32768);
+
+    // We could skip the shift here, then simply cast the result into [u8; 16], and take
+    // bytes 2, 6, 10, 14 instead (after clamping), but it's not any faster, it seems.
+    let r: i32x4 = (gray + cr2r + half) >> 16;
+    let g: i32x4 = (gray + cr2g + cb2g + half) >> 16;
+    let b: i32x4 = (gray + cb2b + half) >> 16;
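+
+    // Worked example, for a pure white input (Y == 235, Cb == Cr == 128):
+    // gray == (235 - 16) * 76309 == 16711671, and both chroma terms are 0,
+    // so r == g == b == (16711671 + 32768) >> 16 == 255, as expected.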
+
+    // Clamping to the valid output range
+    // A simple clamp(x, 0, 255) doesn't work, because it seems to
+    // operate on the vector as a whole, instead of on each lane separately.
+    let max = i32x4::splat(255);
+
+    let r = r.max(i32x4::ZERO).min(max);
+    let g = g.max(i32x4::ZERO).min(max);
+    let b = b.max(i32x4::ZERO).min(max);
+
+    // The output alpha values are fixed
+    let a = i32x4::splat(255);
+    // Transposing the separate RGBA components into a single interleaved vector
+    // Thanks for the tip, Lokathor!
+    #[cfg(target_endian = "little")]
+    let rgba_4x = ((r) | (g << 8)) | ((b << 16) | (a << 24));
+    #[cfg(target_endian = "big")] // I haven't tested this, but it should work
+    let rgba_4x = ((r << 24) | (g << 16)) | ((b << 8) | (a));
+
+    rgba.copy_from_slice(bytemuck::cast::<i32x4, u8x16>(rgba_4x).as_array_ref())
 }
 
+// A single-pixel version, only for testing.
+#[cfg(test)]
 #[inline]
-fn yuv_to_rgb(yuv: (u8, u8, u8), luts: &LUTs) -> (u8, u8, u8) {
-    let (y, cb, cr) = yuv;
+fn yuv_to_rgb(yuv: (u8, u8, u8)) -> (u8, u8, u8) {
+    let mut rgba_4x = [0u8; 16];
+    yuv_to_rgba_4x(
+        (
+            &[yuv.0, yuv.0, yuv.0, yuv.0],
+            &[yuv.1, yuv.1],
+            &[yuv.2, yuv.2],
+        ),
+        &mut rgba_4x,
+    );
 
-    // We rely on the optimizers in rustc/LLVM to eliminate the bounds checks when indexing
-    // into the fixed 256-long arrays in `luts` with indices coming in as `u8` parameters.
-    // This is crucial for performance, as this function runs in a fairly tight loop, on all pixels.
-    // I verified that this is actually happening, see here: https://rust.godbolt.org/z/vWzesYzbq
-    // And benchmarking showed no time difference from an `unsafe` + `get_unchecked()` solution.
-    let gray = luts.y_to_gray[y as usize];
-
-    // The `(... + 8) >> 4` parts convert back from 12.4 fixed-point to `u8` with correct rounding.
-    // (At least for positive numbers - any negative numbers that might occur will be clamped to 0 anyway.)
-    let r = (gray + luts.cr_to_r[cr as usize] + 8) >> 4;
-    let g = (gray + luts.cr_to_g[cr as usize] + luts.cb_to_g[cb as usize] + 8) >> 4;
-    let b = (gray + luts.cb_to_b[cb as usize] + 8) >> 4;
-
-    (
-        r.clamp(0, 255) as u8,
-        g.clamp(0, 255) as u8,
-        b.clamp(0, 255) as u8,
-    )
+    // All four output pixels should be the same, with alpha fixed at 255
+    assert!(rgba_4x[3] == 255);
+    assert!(rgba_4x[4] == rgba_4x[0]);
+    assert!(rgba_4x[5] == rgba_4x[1]);
+    assert!(rgba_4x[6] == rgba_4x[2]);
+    assert!(rgba_4x[7] == 255);
+    assert!(rgba_4x[8] == rgba_4x[0]);
+    assert!(rgba_4x[9] == rgba_4x[1]);
+    assert!(rgba_4x[10] == rgba_4x[2]);
+    assert!(rgba_4x[11] == 255);
+    assert!(rgba_4x[12] == rgba_4x[0]);
+    assert!(rgba_4x[13] == rgba_4x[1]);
+    assert!(rgba_4x[14] == rgba_4x[2]);
+    assert!(rgba_4x[15] == 255);
+
+    (rgba_4x[0], rgba_4x[1], rgba_4x[2])
 }
 
 /// Convert planar YUV 4:2:0 data into interleaved RGBA 8888 data.
@@ -107,6 +104,7 @@ fn yuv_to_rgb(yuv: (u8, u8, u8), luts: &LUTs) -> (u8, u8, u8) {
 /// - `br_width` must be half of `y_width`, rounded up
 /// - With `y_height` computed as `y.len() / y_width`, and `br_height` as `chroma_b.len() / br_width`:
 ///   `br_height` must be half of `y_height`, rounded up
+///
 pub fn yuv420_to_rgba(
     y: &[u8],
     chroma_b: &[u8],
     chroma_r: &[u8],
@@ -138,42 +136,67 @@ pub fn yuv420_to_rgba(
     let mut rgba = vec![0; y.len() * 4];
     let rgba_stride = y_width * 4; // 4 bytes per pixel, interleaved
 
-    // making sure that the "is it initialized already?" check is only done once per frame by getting a direct reference
-    let luts: &LUTs = &*LUTS;
-
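+    // Chroma is subsampled by 2 in both directions: each chroma row is shared by
+    // two luma rows (luma rows 0 and 1 both read chroma row 0), and each chroma
+    // sample is shared by two horizontally adjacent luma samples.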
+    // Iteration is done in row-major order to fit the slice layouts.
     for luma_rowindex in 0..y_height {
         let chroma_rowindex = luma_rowindex / 2;
 
-        let y_row = &y[luma_rowindex * y_width..(luma_rowindex + 1) * y_width];
-        let cb_row = &chroma_b[chroma_rowindex * br_width..(chroma_rowindex + 1) * br_width];
-        let cr_row = &chroma_r[chroma_rowindex * br_width..(chroma_rowindex + 1) * br_width];
-        let rgba_row = &mut rgba[luma_rowindex * rgba_stride..(luma_rowindex + 1) * rgba_stride];
-
-        // Iterating on 2 pixels at a time, leaving off the last one if width is odd.
-        let y_iter = y_row.chunks_exact(2);
-        let cb_iter = cb_row.iter();
-        let cr_iter = cr_row.iter();
-        // Similar to how Y is iterated on, but with 4 channels per pixel
-        let rgba_iter = rgba_row.chunks_exact_mut(8);
-
-        for (((y, cb), cr), rgba) in y_iter.zip(cb_iter).zip(cr_iter).zip(rgba_iter) {
-            let rgb0 = yuv_to_rgb((y[0], *cb, *cr), luts);
-            let rgb1 = yuv_to_rgb((y[1], *cb, *cr), luts);
-            // The output alpha values are fixed
-            rgba.copy_from_slice(&[rgb0.0, rgb0.1, rgb0.2, 255, rgb1.0, rgb1.1, rgb1.2, 255]);
+        let y_remainder = y_width % 4;
+        let br_remainder = br_width % 2;
+        let rgba_remainder = y_remainder * 4;
+
+        // This block is here just so the mutable borrow of rgba_row expires sooner.
+        {
+            // These borrows only include whole chunks of lengths 4 and 2.
+            let y_row = &y[luma_rowindex * y_width..(luma_rowindex + 1) * y_width - y_remainder];
+            let cb_row = &chroma_b
+                [chroma_rowindex * br_width..(chroma_rowindex + 1) * br_width - br_remainder];
+            let cr_row = &chroma_r
+                [chroma_rowindex * br_width..(chroma_rowindex + 1) * br_width - br_remainder];
+            let rgba_row = &mut rgba
+                [luma_rowindex * rgba_stride..(luma_rowindex + 1) * rgba_stride - rgba_remainder];
+
+            // TODO: Replace `bytemuck::cast_slice` with `std::slice::array_chunks` when it's stable.
+
+            // Iterating on 4 pixels (in a horizontal row arrangement) at a time,
+            // leaving off the last few on the right if width is not divisible by 4.
+            let y_iter = bytemuck::cast_slice::<u8, [u8; 4]>(y_row).iter();
+            // We need half as many chroma samples for each iteration
+            let cb_iter = bytemuck::cast_slice::<u8, [u8; 2]>(cb_row).iter();
+            let cr_iter = bytemuck::cast_slice::<u8, [u8; 2]>(cr_row).iter();
+            // Similar to how Y is iterated on, but with 4 channels per pixel
+            let rgba_iter = bytemuck::cast_slice_mut::<u8, [u8; 16]>(rgba_row).iter_mut();
+
+            for (((y, cb), cr), rgba) in y_iter.zip(cb_iter).zip(cr_iter).zip(rgba_iter) {
+                yuv_to_rgba_4x((y, cb, cr), rgba);
+            }
         }
 
-        // On odd wide pictures, the last pixel is not covered by the iteration above,
-        // but is included in y_row and rgba_row.
-        if y_width % 2 == 1 {
-            let y = y_row.last().unwrap();
-            let cb = cb_row.last().unwrap();
-            let cr = cr_row.last().unwrap();
-
-            let rgb = yuv_to_rgb((*y, *cb, *cr), luts);
-
-            rgba_row[rgba_stride - 4..rgba_stride].copy_from_slice(&[rgb.0, rgb.1, rgb.2, 255])
+        // On pictures with width not divisible by 4, the last few pixels are not
+        // covered by the iteration above, so they are handled here, all at once in each row.
+        if y_remainder != 0 {
+            // These are the same borrows as above, but with the whole row, not rounded down to multiples of 4 or 2.
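+            // (For example, with y_width == 5: y_remainder == 1, so only the last
+            // pixel of each row is gathered below. It lands in y[0], reads
+            // cb_row[2] and cr_row[2], and only rgba_4x[0..4] is copied back out.)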
+ let y_row = &y[luma_rowindex * y_width..(luma_rowindex + 1) * y_width]; + let cb_row = &chroma_b[chroma_rowindex * br_width..(chroma_rowindex + 1) * br_width]; + let cr_row = &chroma_r[chroma_rowindex * br_width..(chroma_rowindex + 1) * br_width]; + let rgba_row = + &mut rgba[luma_rowindex * rgba_stride..(luma_rowindex + 1) * rgba_stride]; + + let mut y = [0u8; 4]; + let mut cb = [0u8; 2]; + let mut cr = [0u8; 2]; + + for x in y_width - y_remainder..y_width { + y[x % 4] = y_row[x]; + cb[(x % 4) / 2] = cb_row[x / 2]; + cr[(x % 4) / 2] = cr_row[x / 2]; + } + + let mut rgba_4x = [0u8; 16]; + yuv_to_rgba_4x((&y, &cb, &cr), &mut rgba_4x); + + for i in rgba_stride - rgba_remainder..rgba_stride { + rgba_row[i] = rgba_4x[i % 16]; + } } } @@ -189,24 +212,24 @@ fn test_yuv_to_rgb() { // Peak colour difference = 16 and 240 // not quite black - assert_eq!(yuv_to_rgb((17, 128, 128), &LUTS), (1, 1, 1)); + assert_eq!(yuv_to_rgb((17, 128, 128)), (1, 1, 1)); // exactly black - assert_eq!(yuv_to_rgb((16, 128, 128), &LUTS), (0, 0, 0)); + assert_eq!(yuv_to_rgb((16, 128, 128)), (0, 0, 0)); // and clamping also works - assert_eq!(yuv_to_rgb((15, 128, 128), &LUTS), (0, 0, 0)); - assert_eq!(yuv_to_rgb((0, 128, 128), &LUTS), (0, 0, 0)); + assert_eq!(yuv_to_rgb((15, 128, 128)), (0, 0, 0)); + assert_eq!(yuv_to_rgb((0, 128, 128)), (0, 0, 0)); // not quite white - assert_eq!(yuv_to_rgb((234, 128, 128), &LUTS), (254, 254, 254)); + assert_eq!(yuv_to_rgb((234, 128, 128)), (254, 254, 254)); // exactly white - assert_eq!(yuv_to_rgb((235, 128, 128), &LUTS), (255, 255, 255)); + assert_eq!(yuv_to_rgb((235, 128, 128)), (255, 255, 255)); // and clamping also works - assert_eq!(yuv_to_rgb((236, 128, 128), &LUTS), (255, 255, 255)); - assert_eq!(yuv_to_rgb((255, 128, 128), &LUTS), (255, 255, 255)); + assert_eq!(yuv_to_rgb((236, 128, 128)), (255, 255, 255)); + assert_eq!(yuv_to_rgb((255, 128, 128)), (255, 255, 255)); // (16 + 235) / 2 = 125.5, for middle grays - assert_eq!(yuv_to_rgb((125, 128, 128), &LUTS), (127, 127, 127)); - assert_eq!(yuv_to_rgb((126, 128, 128), &LUTS), (128, 128, 128)); + assert_eq!(yuv_to_rgb((125, 128, 128)), (127, 127, 127)); + assert_eq!(yuv_to_rgb((126, 128, 128)), (128, 128, 128)); } // Inverse conversion, for testing purposes only @@ -257,43 +280,34 @@ fn test_rgb_to_yuv() { #[test] fn test_rgb_yuv_rgb_roundtrip_sanity() { - assert_eq!(yuv_to_rgb(rgb_to_yuv((0, 0, 0)), &LUTS), (0, 0, 0)); - assert_eq!( - yuv_to_rgb(rgb_to_yuv((127, 127, 127)), &LUTS), - (127, 127, 127) - ); - assert_eq!( - yuv_to_rgb(rgb_to_yuv((128, 128, 128)), &LUTS), - (128, 128, 128) - ); - assert_eq!( - yuv_to_rgb(rgb_to_yuv((255, 255, 255)), &LUTS), - (255, 255, 255) - ); + assert_eq!(yuv_to_rgb(rgb_to_yuv((0, 0, 0))), (0, 0, 0)); + assert_eq!(yuv_to_rgb(rgb_to_yuv((127, 127, 127))), (127, 127, 127)); + assert_eq!(yuv_to_rgb(rgb_to_yuv((128, 128, 128))), (128, 128, 128)); + assert_eq!(yuv_to_rgb(rgb_to_yuv((255, 255, 255))), (255, 255, 255)); assert_eq!( - yuv_to_rgb(rgb_to_yuv((255, 0, 0)), &LUTS), + yuv_to_rgb(rgb_to_yuv((255, 0, 0))), (254, 0, 0) // !!! there is a rounding error here ); assert_eq!( - yuv_to_rgb(rgb_to_yuv((0, 255, 0)), &LUTS), + yuv_to_rgb(rgb_to_yuv((0, 255, 0))), (0, 255, 1) // !!! there is a rounding error here ); assert_eq!( - yuv_to_rgb(rgb_to_yuv((0, 0, 255)), &LUTS), + yuv_to_rgb(rgb_to_yuv((0, 0, 255))), (0, 0, 255) // there is NO rounding error here ); assert_eq!( - yuv_to_rgb(rgb_to_yuv((0, 255, 255)), &LUTS), + yuv_to_rgb(rgb_to_yuv((0, 255, 255))), (1, 255, 255) // !!! 
there is a rounding error here
     );
     assert_eq!(
-        yuv_to_rgb(rgb_to_yuv((255, 0, 255)), &LUTS),
+        yuv_to_rgb(rgb_to_yuv((255, 0, 255))),
         (255, 0, 254) // !!! there is a rounding error here
     );
     assert_eq!(
-        yuv_to_rgb(rgb_to_yuv((255, 255, 0)), &LUTS),
+        yuv_to_rgb(rgb_to_yuv((255, 255, 0))),
         (255, 255, 0) // there is NO rounding error here
     );
 
@@ -310,7 +324,7 @@ fn test_rgb_yuv_rgb_roundtrip_sanity() {
         (188, 189, 34),
         (23, 190, 207),
     ] {
-        let rgb2 = yuv_to_rgb(rgb_to_yuv(rgb), &LUTS);
+        let rgb2 = yuv_to_rgb(rgb_to_yuv(rgb));
         // Allowing for a difference of at most 1 on each component in both directions,
         // to account for the limited precision in YUV form, and two roundings
         assert!((rgb.0 as i32 - rgb2.0 as i32).abs() <= 1);
@@ -320,7 +334,7 @@
 }
 
 #[test]
-fn test_yuv420_to_rgba() {
+fn test_yuv420_to_rgba_tiny() {
     // empty picture
     assert_eq!(yuv420_to_rgba(&[], &[], &[], 0, 0), vec![0u8; 0]);
 
@@ -405,5 +419,73 @@
     );
 
     // The middle row/column of pixels use the top/left row/column of chroma samples:
-    assert_eq!(yuv_to_rgb((125, 90, 240), &LUTS), (255, 51, 50));
+    assert_eq!(yuv_to_rgb((125, 90, 240)), (255, 51, 50));
+}
+
+#[test]
+fn test_yuv420_to_rgba_medium() {
+    // A 4x4 picture, red on the top, green on the bottom.
+    // This should be done by SIMD now.
+    #[rustfmt::skip]
+    assert_eq!(
+        yuv420_to_rgba(
+            &[ 81u8,  81u8,  81u8,  81u8,
+               81u8,  81u8,  81u8,  81u8,
+              145u8, 145u8, 145u8, 145u8,
+              145u8, 145u8, 145u8, 145u8],
+            &[ 90u8,  90u8,
+               54u8,  54u8],
+            &[240u8, 240u8,
+               34u8,  34u8],
+            4, 2),
+        vec![
+            254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, // red, with rounding error
+            254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, // red, with rounding error
+            0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, // green, with rounding error
+            0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, // green, with rounding error
+        ]
+    );
+
+    // A 5x4 picture, red on the top, green on the bottom.
+    // This should be done by SIMD now, plus one column of remainder.
+    #[rustfmt::skip]
+    assert_eq!(
+        yuv420_to_rgba(
+            &[ 81u8,  81u8,  81u8,  81u8,  81u8,
+               81u8,  81u8,  81u8,  81u8,  81u8,
+              145u8, 145u8, 145u8, 145u8, 145u8,
+              145u8, 145u8, 145u8, 145u8, 145u8],
+            &[ 90u8,  90u8,  90u8,
+               54u8,  54u8,  54u8],
+            &[240u8, 240u8, 240u8,
+               34u8,  34u8,  34u8],
+            5, 3),
+        vec![
+            254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8,
+            254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8,
+            0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8,
+            0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8,
+        ]
+    );
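+
+    // Note: with a width of 5, y_remainder == 1, so the 5x4 cases above and below
+    // exercise both the 4-wide SIMD path and the per-row remainder path.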
+
+    // Same as before, but the last column is upside down, to check if it uses the right values.
+    #[rustfmt::skip]
+    assert_eq!(
+        yuv420_to_rgba(
+            &[ 81u8,  81u8,  81u8,  81u8, 145u8,
+               81u8,  81u8,  81u8,  81u8, 145u8,
+              145u8, 145u8, 145u8, 145u8,  81u8,
+              145u8, 145u8, 145u8, 145u8,  81u8],
+            &[ 90u8,  90u8,  54u8,
+               54u8,  54u8,  90u8],
+            &[240u8, 240u8,  34u8,
+               34u8,  34u8, 240u8],
+            5, 3),
+        vec![
+            254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, 0u8, 255u8, 1u8, 255u8, // red, last pixel green (with rounding errors)
+            254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, 0u8, 255u8, 1u8, 255u8, // red, last pixel green (with rounding errors)
+            0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, 254u8, 0u8, 0u8, 255u8, // green, last pixel red (with rounding errors)
+            0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, 254u8, 0u8, 0u8, 255u8, // green, last pixel red (with rounding errors)
+        ]
+    );
 }
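+
+// A sketch of an extra check, not part of the original change: a uniform 4x4
+// mid-gray picture, exercising the SIMD path with no remainder. The expected
+// bytes follow from test_yuv_to_rgb above: (126, 128, 128) maps to (128, 128, 128).
+#[test]
+fn test_yuv420_to_rgba_uniform_gray() {
+    assert_eq!(
+        yuv420_to_rgba(&[126u8; 16], &[128u8; 4], &[128u8; 4], 4, 2),
+        [128u8, 128u8, 128u8, 255u8].repeat(16)
+    );
+}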