diff --git a/Cargo.lock b/Cargo.lock index 596bdeb..c1ceb71 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -14,6 +14,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bytemuck" +version = "1.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72957246c41db82b8ef88a5486143830adeb8227ef9837740bdec67724cf2c5b" + [[package]] name = "h263-rs" version = "0.1.0" @@ -28,7 +34,8 @@ dependencies = [ name = "h263-rs-yuv" version = "0.1.0" dependencies = [ - "lazy_static", + "bytemuck", + "wide", ] [[package]] @@ -64,6 +71,15 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "safe_arch" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "794821e4ccb0d9f979512f9c1973480123f9bd62a90d74ab0f9426fcf8f4a529" +dependencies = [ + "bytemuck", +] + [[package]] name = "syn" version = "1.0.75" @@ -100,3 +116,13 @@ name = "unicode-xid" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" + +[[package]] +name = "wide" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "476da2f1d225632b1fffe638ff979a4bc03907e29b0ab596efca7624014f8b62" +dependencies = [ + "bytemuck", + "safe_arch", +] diff --git a/yuv/Cargo.toml b/yuv/Cargo.toml index 259c893..85abb84 100644 --- a/yuv/Cargo.toml +++ b/yuv/Cargo.toml @@ -6,4 +6,5 @@ edition = "2018" license = "MIT OR Apache-2.0" [dependencies] -lazy_static = "1.4.0" \ No newline at end of file +wide = "0.7.3" +bytemuck = "1.7.2" \ No newline at end of file diff --git a/yuv/src/bt601.rs b/yuv/src/bt601.rs index 244e3ce..d3b9c70 100644 --- a/yuv/src/bt601.rs +++ b/yuv/src/bt601.rs @@ -1,96 +1,93 @@ //! YUV-to-RGB decode -use lazy_static::lazy_static; - -/// Precomputes and stores the linear functions for converting YUV (YCb'Cr' to be precise) -/// colors to RGB (sRGB-like, with gamma) colors, in signed 12.4 fixed-point integer format. -/// -/// Since the incoming components are u8, and there is only ever at most 3 of them added -/// at once (when computing the G channel), only about 10 bits would be used if they were -/// u8 - so to get some more precision (and reduce potential stepping artifacts), might -/// as well use about 14 of the 15 (not counting the sign bit) available in i16. -struct LUTs { - /// the contribution of the Y component into all RGB channels - pub y_to_gray: [i16; 256], - /// the contribution of the V (Cr') component into the R channel - pub cr_to_r: [i16; 256], - /// the contribution of the V (Cr') component into the G channel - pub cr_to_g: [i16; 256], - /// the contribution of the U (Cb') component into the G channel - pub cb_to_g: [i16; 256], - /// the contribution of the U (Cb') component into the B channel - pub cb_to_b: [i16; 256], -} - -impl LUTs { - pub fn new() -> LUTs { - let mut y_to_gray = [0i16; 256]; - let mut cr_to_r = [0i16; 256]; - let mut cr_to_g = [0i16; 256]; - let mut cb_to_g = [0i16; 256]; - let mut cb_to_b = [0i16; 256]; - - // - Y needs to be remapped linearly from 16..235 to 0..255 - // - Cr' and Cb' (a.k.a. V and U) need to be remapped linearly from 16..240 to 0..255, - // then shifted to -128..127, and then scaled by the appropriate coefficients - // - Finally all values are multiplied by 16 (1<<4) to turn them into 12.4 format, and rounded to integer. 
-
-        for i in 0..256 {
-            let f = i as f32;
-
-            // According to Wikipedia, these are the exact values from the
-            // ITU-R BT.601 standard. See the last group of equations on:
-            // https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion
-            let y2gray = (255.0 / 219.0) * (f - 16.0);
-            let cr2r = (255.0 / 224.0) * 1.402 * (f - 128.0);
-            let cr2g = -(255.0 / 224.0) * 1.402 * (0.299 / 0.587) * (f - 128.0);
-            let cb2g = -(255.0 / 224.0) * 1.772 * (0.114 / 0.587) * (f - 128.0);
-            let cb2b = (255.0 / 224.0) * 1.772 * (f - 128.0);
-
-            // Converting to 12.4 format and rounding before storing
-            y_to_gray[i] = (y2gray * 16.0).round() as i16;
-            cr_to_r[i] = (cr2r * 16.0).round() as i16;
-            cr_to_g[i] = (cr2g * 16.0).round() as i16;
-            cb_to_g[i] = (cb2g * 16.0).round() as i16;
-            cb_to_b[i] = (cb2b * 16.0).round() as i16;
-        }
-
-        LUTs {
-            y_to_gray,
-            cr_to_r,
-            cr_to_g,
-            cb_to_g,
-            cb_to_b,
-        }
-    }
-}
+// TODO: Replace with `std::simd` when it's stable
+use wide::{i32x4, u8x16};
+
+// Operates on 4 pixels at a time, one pixel per SIMD lane,
+// with 32 bits of intermediate per-component precision for
+// each, so as to fill the 128-bit SIMD registers on WASM.
+// Using i32x4 also allows the neat transpose trick at the end.
+// The output is an interleaved array of 4 RGBA pixels.
+#[inline]
+fn yuv_to_rgba_4x(yuv: (&[u8; 4], &[u8; 2], &[u8; 2]), rgba: &mut [u8; 16]) {
+    let (y, cb, cr) = yuv;
 
-lazy_static! {
-    static ref LUTS: LUTs = LUTs::new();
+    // Expanding the 4 bytes into an i32x4, and duplicating chroma samples horizontally.
+    // The -16 and -128 are simply undoing the offsets in the input representation.
+    let y = i32x4::from([y[0] as i32, y[1] as i32, y[2] as i32, y[3] as i32]) - i32x4::splat(16);
+    let cb =
+        i32x4::from([cb[0] as i32, cb[0] as i32, cb[1] as i32, cb[1] as i32]) - i32x4::splat(128);
+    let cr =
+        i32x4::from([cr[0] as i32, cr[0] as i32, cr[1] as i32, cr[1] as i32]) - i32x4::splat(128);
+
+    // The rest of the magic numbers are the coefficients, converted to 16.16 fixed point and rounded.
+    // They also include the expansion from reduced range (16..235 and 16..240) to full range (0..255).
+    let gray = y * i32x4::splat(76309); // 76309 == round((255.0 / 219.0) * 65536.0)
+    let cr2r = cr * i32x4::splat(104597); // 104597 == round((255.0 / 224.0) * 1.402 * 65536.0)
+    let cr2g = cr * i32x4::splat(-53279); // -53279 == round(-(255.0 / 224.0) * 1.402 * (0.299 / 0.587) * 65536.0)
+    let cb2g = cb * i32x4::splat(-25675); // -25675 == round(-(255.0 / 224.0) * 1.772 * (0.114 / 0.587) * 65536.0)
+    let cb2b = cb * i32x4::splat(132201); // 132201 == round((255.0 / 224.0) * 1.772 * 65536.0)
+
+    // This is 0.5 in 16.16 format, added to make the right shift round correctly
+    let half = i32x4::splat(32768);
+
+    // We could skip the shift here, then simply cast the result into [u8; 16], and take
+    // bytes 2, 6, 10, 14 instead (after clamping), but it's not any faster, it seems.
+    let r: i32x4 = (gray + cr2r + half) >> 16;
+    let g: i32x4 = (gray + cr2g + cb2g + half) >> 16;
+    let b: i32x4 = (gray + cb2b + half) >> 16;
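+
+    // Worked example, for a pure white input (Y == 235, Cb == Cr == 128):
+    // gray == (235 - 16) * 76309 == 16711671, and both chroma terms are 0,
+    // so r == g == b == (16711671 + 32768) >> 16 == 255, as expected.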
+
+    // Clamping to the valid output range
+    // A simple clamp(x, 0, 255) doesn't work, because it seems to
+    // operate on the vector as a whole, instead of on each lane separately.
+    let max = i32x4::splat(255);
+
+    let r = r.max(i32x4::ZERO).min(max);
+    let g = g.max(i32x4::ZERO).min(max);
+    let b = b.max(i32x4::ZERO).min(max);
+
+    // The output alpha values are fixed
+    let a = i32x4::splat(255);
+    // Transposing the separate RGBA components into a single interleaved vector
+    // Thanks for the tip, Lokathor!
+    #[cfg(target_endian = "little")]
+    let rgba_4x = ((r) | (g << 8)) | ((b << 16) | (a << 24));
+    #[cfg(target_endian = "big")] // I haven't tested this, but it should work
+    let rgba_4x = ((r << 24) | (g << 16)) | ((b << 8) | (a));
+
+    rgba.copy_from_slice(bytemuck::cast::<i32x4, u8x16>(rgba_4x).as_array_ref())
 }
 
+// A single-pixel version, only for testing.
+#[cfg(test)]
 #[inline]
-fn yuv_to_rgb(yuv: (u8, u8, u8), luts: &LUTs) -> (u8, u8, u8) {
-    let (y, cb, cr) = yuv;
+fn yuv_to_rgb(yuv: (u8, u8, u8)) -> (u8, u8, u8) {
+    let mut rgba_4x = [0u8; 16];
+    yuv_to_rgba_4x(
+        (
+            &[yuv.0, yuv.0, yuv.0, yuv.0],
+            &[yuv.1, yuv.1],
+            &[yuv.2, yuv.2],
+        ),
+        &mut rgba_4x,
+    );
 
-    // We rely on the optimizers in rustc/LLVM to eliminate the bounds checks when indexing
-    // into the fixed 256-long arrays in `luts` with indices coming in as `u8` parameters.
-    // This is crucial for performance, as this function runs in a fairly tight loop, on all pixels.
-    // I verified that this is actually happening, see here: https://rust.godbolt.org/z/vWzesYzbq
-    // And benchmarking showed no time difference from an `unsafe` + `get_unchecked()` solution.
-    let gray = luts.y_to_gray[y as usize];
-
-    // The `(... + 8) >> 4` parts convert back from 12.4 fixed-point to `u8` with correct rounding.
-    // (At least for positive numbers - any negative numbers that might occur will be clamped to 0 anyway.)
-    let r = (gray + luts.cr_to_r[cr as usize] + 8) >> 4;
-    let g = (gray + luts.cr_to_g[cr as usize] + luts.cb_to_g[cb as usize] + 8) >> 4;
-    let b = (gray + luts.cb_to_b[cb as usize] + 8) >> 4;
-
-    (
-        r.clamp(0, 255) as u8,
-        g.clamp(0, 255) as u8,
-        b.clamp(0, 255) as u8,
-    )
+    // All four output pixels should be the same, with alpha fixed at 255
+    assert!(rgba_4x[3] == 255);
+    assert!(rgba_4x[4] == rgba_4x[0]);
+    assert!(rgba_4x[5] == rgba_4x[1]);
+    assert!(rgba_4x[6] == rgba_4x[2]);
+    assert!(rgba_4x[7] == 255);
+    assert!(rgba_4x[8] == rgba_4x[0]);
+    assert!(rgba_4x[9] == rgba_4x[1]);
+    assert!(rgba_4x[10] == rgba_4x[2]);
+    assert!(rgba_4x[11] == 255);
+    assert!(rgba_4x[12] == rgba_4x[0]);
+    assert!(rgba_4x[13] == rgba_4x[1]);
+    assert!(rgba_4x[14] == rgba_4x[2]);
+    assert!(rgba_4x[15] == 255);
+
+    (rgba_4x[0], rgba_4x[1], rgba_4x[2])
 }
 
 /// Convert planar YUV 4:2:0 data into interleaved RGBA 8888 data.
@@ -107,6 +104,7 @@ fn yuv_to_rgb(yuv: (u8, u8, u8), luts: &LUTs) -> (u8, u8, u8) {
 /// - `br_width` must be half of `y_width`, rounded up
 /// - With `y_height` computed as `y.len() / y_width`, and `br_height` as `chroma_b.len() / br_width`:
 ///   `br_height` must be half of `y_height`, rounded up
+///
 pub fn yuv420_to_rgba(
     y: &[u8],
     chroma_b: &[u8],
     chroma_r: &[u8],
@@ -138,42 +136,67 @@ pub fn yuv420_to_rgba(
     let mut rgba = vec![0; y.len() * 4];
     let rgba_stride = y_width * 4; // 4 bytes per pixel, interleaved
 
-    // making sure that the "is it initialized already?" check is only done once per frame by getting a direct reference
-    let luts: &LUTs = &*LUTS;
-
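+    // Chroma is subsampled by 2 in both directions: each chroma row is shared by
+    // two luma rows (luma rows 0 and 1 both read chroma row 0), and each chroma
+    // sample is shared by two horizontally adjacent luma samples.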
+    // Iteration is done in row-major order to fit the slice layouts.
     for luma_rowindex in 0..y_height {
         let chroma_rowindex = luma_rowindex / 2;
 
-        let y_row = &y[luma_rowindex * y_width..(luma_rowindex + 1) * y_width];
-        let cb_row = &chroma_b[chroma_rowindex * br_width..(chroma_rowindex + 1) * br_width];
-        let cr_row = &chroma_r[chroma_rowindex * br_width..(chroma_rowindex + 1) * br_width];
-        let rgba_row = &mut rgba[luma_rowindex * rgba_stride..(luma_rowindex + 1) * rgba_stride];
-
-        // Iterating on 2 pixels at a time, leaving off the last one if width is odd.
-        let y_iter = y_row.chunks_exact(2);
-        let cb_iter = cb_row.iter();
-        let cr_iter = cr_row.iter();
-        // Similar to how Y is iterated on, but with 4 channels per pixel
-        let rgba_iter = rgba_row.chunks_exact_mut(8);
-
-        for (((y, cb), cr), rgba) in y_iter.zip(cb_iter).zip(cr_iter).zip(rgba_iter) {
-            let rgb0 = yuv_to_rgb((y[0], *cb, *cr), luts);
-            let rgb1 = yuv_to_rgb((y[1], *cb, *cr), luts);
-            // The output alpha values are fixed
-            rgba.copy_from_slice(&[rgb0.0, rgb0.1, rgb0.2, 255, rgb1.0, rgb1.1, rgb1.2, 255]);
+        let y_remainder = y_width % 4;
+        let br_remainder = br_width % 2;
+        let rgba_remainder = y_remainder * 4;
+
+        // This block is here just so the mutable borrow of rgba_row expires sooner.
+        {
+            // These borrows only include whole chunks of lengths 4 and 2.
+            let y_row = &y[luma_rowindex * y_width..(luma_rowindex + 1) * y_width - y_remainder];
+            let cb_row = &chroma_b
+                [chroma_rowindex * br_width..(chroma_rowindex + 1) * br_width - br_remainder];
+            let cr_row = &chroma_r
+                [chroma_rowindex * br_width..(chroma_rowindex + 1) * br_width - br_remainder];
+            let rgba_row = &mut rgba
+                [luma_rowindex * rgba_stride..(luma_rowindex + 1) * rgba_stride - rgba_remainder];
+
+            // TODO: Replace `bytemuck::cast_slice` with `std::slice::array_chunks` when it's stable.
+
+            // Iterating on 4 pixels (in a horizontal row arrangement) at a time,
+            // leaving off the last few on the right if width is not divisible by 4.
+            let y_iter = bytemuck::cast_slice::<u8, [u8; 4]>(y_row).iter();
+            // We need half as many chroma samples for each iteration
+            let cb_iter = bytemuck::cast_slice::<u8, [u8; 2]>(cb_row).iter();
+            let cr_iter = bytemuck::cast_slice::<u8, [u8; 2]>(cr_row).iter();
+            // Similar to how Y is iterated on, but with 4 channels per pixel
+            let rgba_iter = bytemuck::cast_slice_mut::<u8, [u8; 16]>(rgba_row).iter_mut();
+
+            for (((y, cb), cr), rgba) in y_iter.zip(cb_iter).zip(cr_iter).zip(rgba_iter) {
+                yuv_to_rgba_4x((y, cb, cr), rgba);
+            }
         }
 
-        // On odd wide pictures, the last pixel is not covered by the iteration above,
-        // but is included in y_row and rgba_row.
-        if y_width % 2 == 1 {
-            let y = y_row.last().unwrap();
-            let cb = cb_row.last().unwrap();
-            let cr = cr_row.last().unwrap();
-
-            let rgb = yuv_to_rgb((*y, *cb, *cr), luts);
-
-            rgba_row[rgba_stride - 4..rgba_stride].copy_from_slice(&[rgb.0, rgb.1, rgb.2, 255])
+        // On pictures with width not divisible by 4, the last few pixels are not
+        // covered by the iteration above, so they are handled here, all at once in each row.
+        if y_remainder != 0 {
+            // These are the same borrows as above, but with the whole row, not rounded down to multiples of 4 or 2.
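+            // (For example, with y_width == 5: y_remainder == 1, so only the last
+            // pixel of each row is gathered below. It lands in y[0], reads
+            // cb_row[2] and cr_row[2], and only rgba_4x[0..4] is copied back out.)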
+ let y_row = &y[luma_rowindex * y_width..(luma_rowindex + 1) * y_width]; + let cb_row = &chroma_b[chroma_rowindex * br_width..(chroma_rowindex + 1) * br_width]; + let cr_row = &chroma_r[chroma_rowindex * br_width..(chroma_rowindex + 1) * br_width]; + let rgba_row = + &mut rgba[luma_rowindex * rgba_stride..(luma_rowindex + 1) * rgba_stride]; + + let mut y = [0u8; 4]; + let mut cb = [0u8; 2]; + let mut cr = [0u8; 2]; + + for x in y_width - y_remainder..y_width { + y[x % 4] = y_row[x]; + cb[(x % 4) / 2] = cb_row[x / 2]; + cr[(x % 4) / 2] = cr_row[x / 2]; + } + + let mut rgba_4x = [0u8; 16]; + yuv_to_rgba_4x((&y, &cb, &cr), &mut rgba_4x); + + for i in rgba_stride - rgba_remainder..rgba_stride { + rgba_row[i] = rgba_4x[i % 16]; + } } } @@ -189,24 +212,24 @@ fn test_yuv_to_rgb() { // Peak colour difference = 16 and 240 // not quite black - assert_eq!(yuv_to_rgb((17, 128, 128), &LUTS), (1, 1, 1)); + assert_eq!(yuv_to_rgb((17, 128, 128)), (1, 1, 1)); // exactly black - assert_eq!(yuv_to_rgb((16, 128, 128), &LUTS), (0, 0, 0)); + assert_eq!(yuv_to_rgb((16, 128, 128)), (0, 0, 0)); // and clamping also works - assert_eq!(yuv_to_rgb((15, 128, 128), &LUTS), (0, 0, 0)); - assert_eq!(yuv_to_rgb((0, 128, 128), &LUTS), (0, 0, 0)); + assert_eq!(yuv_to_rgb((15, 128, 128)), (0, 0, 0)); + assert_eq!(yuv_to_rgb((0, 128, 128)), (0, 0, 0)); // not quite white - assert_eq!(yuv_to_rgb((234, 128, 128), &LUTS), (254, 254, 254)); + assert_eq!(yuv_to_rgb((234, 128, 128)), (254, 254, 254)); // exactly white - assert_eq!(yuv_to_rgb((235, 128, 128), &LUTS), (255, 255, 255)); + assert_eq!(yuv_to_rgb((235, 128, 128)), (255, 255, 255)); // and clamping also works - assert_eq!(yuv_to_rgb((236, 128, 128), &LUTS), (255, 255, 255)); - assert_eq!(yuv_to_rgb((255, 128, 128), &LUTS), (255, 255, 255)); + assert_eq!(yuv_to_rgb((236, 128, 128)), (255, 255, 255)); + assert_eq!(yuv_to_rgb((255, 128, 128)), (255, 255, 255)); // (16 + 235) / 2 = 125.5, for middle grays - assert_eq!(yuv_to_rgb((125, 128, 128), &LUTS), (127, 127, 127)); - assert_eq!(yuv_to_rgb((126, 128, 128), &LUTS), (128, 128, 128)); + assert_eq!(yuv_to_rgb((125, 128, 128)), (127, 127, 127)); + assert_eq!(yuv_to_rgb((126, 128, 128)), (128, 128, 128)); } // Inverse conversion, for testing purposes only @@ -257,43 +280,34 @@ fn test_rgb_to_yuv() { #[test] fn test_rgb_yuv_rgb_roundtrip_sanity() { - assert_eq!(yuv_to_rgb(rgb_to_yuv((0, 0, 0)), &LUTS), (0, 0, 0)); - assert_eq!( - yuv_to_rgb(rgb_to_yuv((127, 127, 127)), &LUTS), - (127, 127, 127) - ); - assert_eq!( - yuv_to_rgb(rgb_to_yuv((128, 128, 128)), &LUTS), - (128, 128, 128) - ); - assert_eq!( - yuv_to_rgb(rgb_to_yuv((255, 255, 255)), &LUTS), - (255, 255, 255) - ); + assert_eq!(yuv_to_rgb(rgb_to_yuv((0, 0, 0))), (0, 0, 0)); + assert_eq!(yuv_to_rgb(rgb_to_yuv((127, 127, 127))), (127, 127, 127)); + assert_eq!(yuv_to_rgb(rgb_to_yuv((128, 128, 128))), (128, 128, 128)); + assert_eq!(yuv_to_rgb(rgb_to_yuv((255, 255, 255))), (255, 255, 255)); assert_eq!( - yuv_to_rgb(rgb_to_yuv((255, 0, 0)), &LUTS), + yuv_to_rgb(rgb_to_yuv((255, 0, 0))), (254, 0, 0) // !!! there is a rounding error here ); assert_eq!( - yuv_to_rgb(rgb_to_yuv((0, 255, 0)), &LUTS), + yuv_to_rgb(rgb_to_yuv((0, 255, 0))), (0, 255, 1) // !!! there is a rounding error here ); assert_eq!( - yuv_to_rgb(rgb_to_yuv((0, 0, 255)), &LUTS), + yuv_to_rgb(rgb_to_yuv((0, 0, 255))), (0, 0, 255) // there is NO rounding error here ); assert_eq!( - yuv_to_rgb(rgb_to_yuv((0, 255, 255)), &LUTS), + yuv_to_rgb(rgb_to_yuv((0, 255, 255))), (1, 255, 255) // !!! 
there is a rounding error here
     );
     assert_eq!(
-        yuv_to_rgb(rgb_to_yuv((255, 0, 255)), &LUTS),
+        yuv_to_rgb(rgb_to_yuv((255, 0, 255))),
         (255, 0, 254) // !!! there is a rounding error here
     );
     assert_eq!(
-        yuv_to_rgb(rgb_to_yuv((255, 255, 0)), &LUTS),
+        yuv_to_rgb(rgb_to_yuv((255, 255, 0))),
         (255, 255, 0) // there is NO rounding error here
     );
 
@@ -310,7 +324,7 @@ fn test_rgb_yuv_rgb_roundtrip_sanity() {
         (188, 189, 34),
         (23, 190, 207),
     ] {
-        let rgb2 = yuv_to_rgb(rgb_to_yuv(rgb), &LUTS);
+        let rgb2 = yuv_to_rgb(rgb_to_yuv(rgb));
         // Allowing for a difference of at most 1 on each component in both directions,
         // to account for the limited precision in YUV form, and two roundings
         assert!((rgb.0 as i32 - rgb2.0 as i32).abs() <= 1);
@@ -320,7 +334,7 @@
 }
 
 #[test]
-fn test_yuv420_to_rgba() {
+fn test_yuv420_to_rgba_tiny() {
     // empty picture
     assert_eq!(yuv420_to_rgba(&[], &[], &[], 0, 0), vec![0u8; 0]);
 
@@ -405,5 +419,73 @@
     );
 
     // The middle row/column of pixels use the top/left row/column of chroma samples:
-    assert_eq!(yuv_to_rgb((125, 90, 240), &LUTS), (255, 51, 50));
+    assert_eq!(yuv_to_rgb((125, 90, 240)), (255, 51, 50));
+}
+
+#[test]
+fn test_yuv420_to_rgba_medium() {
+    // A 4x4 picture, red on the top, green on the bottom.
+    // This should be done by SIMD now.
+    #[rustfmt::skip]
+    assert_eq!(
+        yuv420_to_rgba(
+            &[ 81u8,  81u8,  81u8,  81u8,
+               81u8,  81u8,  81u8,  81u8,
+              145u8, 145u8, 145u8, 145u8,
+              145u8, 145u8, 145u8, 145u8],
+            &[ 90u8,  90u8,
+               54u8,  54u8],
+            &[240u8, 240u8,
+               34u8,  34u8],
+            4, 2),
+        vec![
+            254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, // red, with rounding error
+            254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, // red, with rounding error
+            0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, // green, with rounding error
+            0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, // green, with rounding error
+        ]
+    );
+
+    // A 5x4 picture, red on the top, green on the bottom.
+    // This should be done by SIMD now, plus one column of remainder.
+    #[rustfmt::skip]
+    assert_eq!(
+        yuv420_to_rgba(
+            &[ 81u8,  81u8,  81u8,  81u8,  81u8,
+               81u8,  81u8,  81u8,  81u8,  81u8,
+              145u8, 145u8, 145u8, 145u8, 145u8,
+              145u8, 145u8, 145u8, 145u8, 145u8],
+            &[ 90u8,  90u8,  90u8,
+               54u8,  54u8,  54u8],
+            &[240u8, 240u8, 240u8,
+               34u8,  34u8,  34u8],
+            5, 3),
+        vec![
+            254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8,
+            254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8,
+            0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8,
+            0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8,
+        ]
+    );
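+
+    // Note: with a width of 5, y_remainder == 1, so the 5x4 cases above and below
+    // exercise both the 4-wide SIMD path and the per-row remainder path.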
+
+    // Same as before, but the last column is upside down, to check if it uses the right values.
+    #[rustfmt::skip]
+    assert_eq!(
+        yuv420_to_rgba(
+            &[ 81u8,  81u8,  81u8,  81u8, 145u8,
+               81u8,  81u8,  81u8,  81u8, 145u8,
+              145u8, 145u8, 145u8, 145u8,  81u8,
+              145u8, 145u8, 145u8, 145u8,  81u8],
+            &[ 90u8,  90u8,  54u8,
+               54u8,  54u8,  90u8],
+            &[240u8, 240u8,  34u8,
+               34u8,  34u8, 240u8],
+            5, 3),
+        vec![
+            254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, 0u8, 255u8, 1u8, 255u8, // red, last pixel green (with rounding errors)
+            254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, 254u8, 0u8, 0u8, 255u8, 0u8, 255u8, 1u8, 255u8, // red, last pixel green (with rounding errors)
+            0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, 254u8, 0u8, 0u8, 255u8, // green, last pixel red (with rounding errors)
+            0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, 0u8, 255u8, 1u8, 255u8, 254u8, 0u8, 0u8, 255u8, // green, last pixel red (with rounding errors)
+        ]
+    );
 }
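+
+// A sketch of an extra check, not part of the original change: a uniform 4x4
+// mid-gray picture, exercising the SIMD path with no remainder. The expected
+// bytes follow from test_yuv_to_rgb above: (126, 128, 128) maps to (128, 128, 128).
+#[test]
+fn test_yuv420_to_rgba_uniform_gray() {
+    assert_eq!(
+        yuv420_to_rgba(&[126u8; 16], &[128u8; 4], &[128u8; 4], 4, 2),
+        [128u8, 128u8, 128u8, 255u8].repeat(16)
+    );
+}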